From: kaf24@localhost.localdomain Date: Mon, 28 Aug 2006 11:09:36 +0000 (+0100) Subject: [XEN] Rename shadow2 to shadow and move the various source X-Git-Tag: archive/raspbian/4.8.0-1+rpi1~1^2~15681^2~9 X-Git-Url: https://dgit.raspbian.org/%22http:/www.example.com/cgi/%22https:/%22bookmarks:/%22man:///%22http:/www.example.com/cgi/%22https:/%22bookmarks:/%22man:/?a=commitdiff_plain;h=99deb3f2fb35722fc11798ea59910ea37fd1562c;p=xen.git [XEN] Rename shadow2 to shadow and move the various source files into a sensible directory hierarchy. Signed-off-by: Keir Fraser --- diff --git a/tools/libxc/xc_hvm_build.c b/tools/libxc/xc_hvm_build.c index c39ffa323f..dce154d7b7 100644 --- a/tools/libxc/xc_hvm_build.c +++ b/tools/libxc/xc_hvm_build.c @@ -441,7 +441,7 @@ static int xc_hvm_build_internal(int xc_handle, goto error_out; } - /* HVM domains must be put into shadow2 mode at the start of day */ + /* HVM domains must be put into shadow mode at the start of day */ if ( xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_ENABLE, NULL, 0, NULL, XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT | diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile index 6c3552c6c9..991fa436b3 100644 --- a/xen/arch/x86/Makefile +++ b/xen/arch/x86/Makefile @@ -2,6 +2,7 @@ subdir-y += acpi subdir-y += cpu subdir-y += genapic subdir-y += hvm +subdir-y += mm subdir-y += oprofile subdir-$(x86_32) += x86_32 @@ -41,23 +42,6 @@ obj-y += traps.o obj-y += usercopy.o obj-y += x86_emulate.o -ifneq ($(pae),n) -obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s3.o shadow2_g3_on_s3.o -else -obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s2.o -endif - -obj-$(x86_64) += shadow2-common.o shadow2_g4_on_s4.o shadow2_g3_on_s3.o \ - shadow2_g2_on_s3.o - -guest_levels = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1)))))) -shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1)))))) -shadow2_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \ - 
-DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1)) - -shadow2_%.o: shadow2.c $(HDRS) Makefile - $(CC) $(CFLAGS) $(call shadow2_defns,$(@F)) -c $< -o $@ - obj-$(crash_debug) += gdbstub.o $(TARGET): $(TARGET)-syms boot/mkelf32 @@ -86,9 +70,6 @@ xen.lds: $(TARGET_SUBARCH)/xen.lds.S $(HDRS) boot/mkelf32: boot/mkelf32.c $(HOSTCC) $(HOSTCFLAGS) -o $@ $< -shadow_guest32.o: shadow.c -shadow_guest32pae.o: shadow.c - .PHONY: clean clean:: rm -f asm-offsets.s xen.lds boot/*.o boot/*~ boot/core boot/mkelf32 diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index f2bb9a5920..7acb3b7cea 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -200,12 +200,12 @@ int arch_domain_create(struct domain *d) #endif /* __x86_64__ */ - shadow2_lock_init(d); - for ( i = 0; i <= SHADOW2_MAX_ORDER; i++ ) - INIT_LIST_HEAD(&d->arch.shadow2.freelists[i]); - INIT_LIST_HEAD(&d->arch.shadow2.p2m_freelist); - INIT_LIST_HEAD(&d->arch.shadow2.p2m_inuse); - INIT_LIST_HEAD(&d->arch.shadow2.toplevel_shadows); + shadow_lock_init(d); + for ( i = 0; i <= SHADOW_MAX_ORDER; i++ ) + INIT_LIST_HEAD(&d->arch.shadow.freelists[i]); + INIT_LIST_HEAD(&d->arch.shadow.p2m_freelist); + INIT_LIST_HEAD(&d->arch.shadow.p2m_inuse); + INIT_LIST_HEAD(&d->arch.shadow.toplevel_shadows); if ( !is_idle_domain(d) ) { @@ -236,7 +236,7 @@ int arch_domain_create(struct domain *d) void arch_domain_destroy(struct domain *d) { - shadow2_final_teardown(d); + shadow_final_teardown(d); free_xenheap_pages( d->arch.mm_perdomain_pt, @@ -342,10 +342,10 @@ int arch_set_info_guest( } } - /* Shadow2: make sure the domain has enough shadow memory to + /* Shadow: make sure the domain has enough shadow memory to * boot another vcpu */ - if ( shadow2_mode_enabled(d) - && d->arch.shadow2.total_pages < shadow2_min_acceptable_pages(d) ) + if ( shadow_mode_enabled(d) + && d->arch.shadow.total_pages < shadow_min_acceptable_pages(d) ) { destroy_gdt(v); return -ENOMEM; @@ -357,8 +357,8 @@ int arch_set_info_guest( /* Don't redo final 
setup */ set_bit(_VCPUF_initialised, &v->vcpu_flags); - if ( shadow2_mode_enabled(d) ) - shadow2_update_paging_modes(v); + if ( shadow_mode_enabled(d) ) + shadow_update_paging_modes(v); update_cr3(v); @@ -936,11 +936,11 @@ void domain_relinquish_resources(struct domain *d) for_each_vcpu ( d, v ) { /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling, - * or sh2_update_paging_modes()) */ + * or sh_update_paging_modes()) */ pfn = pagetable_get_pfn(v->arch.guest_table); if ( pfn != 0 ) { - if ( shadow2_mode_refcounts(d) ) + if ( shadow_mode_refcounts(d) ) put_page(mfn_to_page(pfn)); else put_page_and_type(mfn_to_page(pfn)); @@ -962,7 +962,7 @@ void domain_relinquish_resources(struct domain *d) hvm_relinquish_guest_resources(d); /* Tear down shadow mode stuff. */ - shadow2_teardown(d); + shadow_teardown(d); /* * Relinquish GDT mappings. No need for explicit unmapping of the LDT as @@ -981,18 +981,18 @@ void domain_relinquish_resources(struct domain *d) void arch_dump_domain_info(struct domain *d) { - if ( shadow2_mode_enabled(d) ) + if ( shadow_mode_enabled(d) ) { - printk(" shadow2 mode: "); - if ( d->arch.shadow2.mode & SHM2_enable ) + printk(" shadow mode: "); + if ( d->arch.shadow.mode & SHM2_enable ) printk("enabled "); - if ( shadow2_mode_refcounts(d) ) + if ( shadow_mode_refcounts(d) ) printk("refcounts "); - if ( shadow2_mode_log_dirty(d) ) + if ( shadow_mode_log_dirty(d) ) printk("log_dirty "); - if ( shadow2_mode_translate(d) ) + if ( shadow_mode_translate(d) ) printk("translate "); - if ( shadow2_mode_external(d) ) + if ( shadow_mode_external(d) ) printk("external "); printk("\n"); } diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index adb903762e..dd44e827f4 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -679,8 +679,8 @@ int construct_dom0(struct domain *d, (void)alloc_vcpu(d, i, i); /* Set up CR3 value for write_ptbase */ - if ( shadow2_mode_enabled(v->domain) ) - 
shadow2_update_paging_modes(v); + if ( shadow_mode_enabled(v->domain) ) + shadow_update_paging_modes(v); else update_cr3(v); @@ -791,8 +791,8 @@ int construct_dom0(struct domain *d, new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start); if ( opt_dom0_shadow ) - if ( shadow2_test_enable(d) == 0 ) - shadow2_update_paging_modes(v); + if ( shadow_test_enable(d) == 0 ) + shadow_update_paging_modes(v); if ( supervisor_mode_kernel ) { diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c index 45caf069e9..912a7c4252 100644 --- a/xen/arch/x86/domctl.c +++ b/xen/arch/x86/domctl.c @@ -39,7 +39,7 @@ long arch_do_domctl( d = find_domain_by_id(domctl->domain); if ( d != NULL ) { - ret = shadow2_domctl(d, &domctl->u.shadow_op, u_domctl); + ret = shadow_domctl(d, &domctl->u.shadow_op, u_domctl); put_domain(d); copy_to_guest(u_domctl, domctl, 1); } diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index 5dc4111171..0a7aa015d4 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -384,8 +384,8 @@ int hvm_copy(void *buf, unsigned long vaddr, int size, int dir) if (count > size) count = size; - gfn = shadow2_gva_to_gfn(v, vaddr); - mfn = mfn_x(sh2_vcpu_gfn_to_mfn(v, gfn)); + gfn = shadow_gva_to_gfn(v, vaddr); + mfn = mfn_x(sh_vcpu_gfn_to_mfn(v, gfn)); if (mfn == INVALID_MFN) return 0; @@ -539,7 +539,7 @@ void hvm_do_hypercall(struct cpu_user_regs *pregs) return; } - if ( current->arch.shadow2.mode->guest_levels == 4 ) + if ( current->arch.shadow.mode->guest_levels == 4 ) { pregs->rax = hvm_hypercall64_table[pregs->rax](pregs->rdi, pregs->rsi, diff --git a/xen/arch/x86/hvm/platform.c b/xen/arch/x86/hvm/platform.c index 920e7786a0..d5fb545728 100644 --- a/xen/arch/x86/hvm/platform.c +++ b/xen/arch/x86/hvm/platform.c @@ -721,7 +721,7 @@ void send_pio_req(struct cpu_user_regs *regs, unsigned long port, if (pvalid) { if (hvm_paging_enabled(current)) - p->u.data = shadow2_gva_to_gpa(current, value); + p->u.data = shadow_gva_to_gpa(current, value); else 
p->u.pdata = (void *) value; /* guest VA == guest PA */ } else @@ -771,7 +771,7 @@ void send_mmio_req( if (pvalid) { if (hvm_paging_enabled(v)) - p->u.data = shadow2_gva_to_gpa(v, value); + p->u.data = shadow_gva_to_gpa(v, value); else p->u.pdata = (void *) value; /* guest VA == guest PA */ } else diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index 3378d3879e..e68d66b0de 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include @@ -746,10 +746,10 @@ static void svm_final_setup_guest(struct vcpu *v) if ( v != d->vcpu[0] ) return; - if ( !shadow2_mode_external(d) ) + if ( !shadow_mode_external(d) ) { DPRINTK("Can't init HVM for dom %u vcpu %u: " - "not in shadow2 external mode\n", d->domain_id, v->vcpu_id); + "not in shadow external mode\n", d->domain_id, v->vcpu_id); domain_crash(d); } @@ -914,7 +914,7 @@ static int svm_do_page_fault(unsigned long va, struct cpu_user_regs *regs) va, eip, (unsigned long)regs->error_code); //#endif - result = shadow2_fault(va, regs); + result = shadow_fault(va, regs); if( result ) { /* Let's make sure that the Guest TLB is flushed */ @@ -1562,7 +1562,7 @@ static int svm_set_cr0(unsigned long value) v->arch.guest_table = pagetable_from_pfn(mfn); if ( old_base_mfn ) put_page(mfn_to_page(old_base_mfn)); - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", (unsigned long) (mfn << PAGE_SHIFT)); @@ -1588,14 +1588,14 @@ static int svm_set_cr0(unsigned long value) svm_inject_exception(v, TRAP_gp_fault, 1, 0); return 0; } - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); } else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE ) { /* we should take care of this kind of situation */ - shadow2_update_paging_modes(v); + 
shadow_update_paging_modes(v); vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); } @@ -1706,7 +1706,7 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs) mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); if (mfn != pagetable_get_pfn(v->arch.guest_table)) __hvm_bug(regs); - shadow2_update_cr3(v); + shadow_update_cr3(v); } else { @@ -1771,7 +1771,7 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs) v->arch.guest_table = pagetable_from_pfn(mfn); if ( old_base_mfn ) put_page(mfn_to_page(old_base_mfn)); - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", (unsigned long) (mfn << PAGE_SHIFT)); @@ -1808,7 +1808,7 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs) if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) { set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); } break; } @@ -2149,7 +2149,7 @@ void svm_handle_invlpg(const short invlpga, struct cpu_user_regs *regs) /* Overkill, we may not this */ set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - shadow2_invlpg(v, g_vaddr); + shadow_invlpg(v, g_vaddr); } @@ -2520,7 +2520,7 @@ void walk_shadow_and_guest_pt(unsigned long gva) struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; unsigned long gpa; - gpa = shadow2_gva_to_gpa(current, gva); + gpa = shadow_gva_to_gpa(current, gva); printk( "gva = %lx, gpa=%lx, gCR3=%x\n", gva, gpa, (u32)vmcb->cr3 ); if( !svm_paging_enabled(v) || mmio_space(gpa) ) return; @@ -2591,7 +2591,7 @@ asmlinkage void svm_vmexit_handler(struct cpu_user_regs regs) if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF) { if (svm_paging_enabled(v) && - !mmio_space(shadow2_gva_to_gpa(current, vmcb->exitinfo2))) + !mmio_space(shadow_gva_to_gpa(current, vmcb->exitinfo2))) { printk("I%08ld,ExC=%s(%d),IP=%x:%llx," "I1=%llx,I2=%llx,INT=%llx, " 
@@ -2601,7 +2601,7 @@ asmlinkage void svm_vmexit_handler(struct cpu_user_regs regs) (unsigned long long) vmcb->exitinfo1, (unsigned long long) vmcb->exitinfo2, (unsigned long long) vmcb->exitintinfo.bytes, - (unsigned long long) shadow2_gva_to_gpa(current, vmcb->exitinfo2)); + (unsigned long long) shadow_gva_to_gpa(current, vmcb->exitinfo2)); } else { diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c index 75de5f49ea..5c80d7e89a 100644 --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include static int vmcs_size; static int vmcs_order; @@ -272,7 +272,7 @@ static void vmx_do_launch(struct vcpu *v) error |= __vmwrite(GUEST_TR_BASE, 0); error |= __vmwrite(GUEST_TR_LIMIT, 0xff); - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); printk("%s(): GUEST_CR3<=%08lx, HOST_CR3<=%08lx\n", __func__, v->arch.hvm_vcpu.hw_cr3, v->arch.cr3); __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index c0a1616c0a..5060ddd04f 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include @@ -66,10 +66,10 @@ static int vmx_initialize_guest_resources(struct vcpu *v) if ( v->vcpu_id != 0 ) return 1; - if ( !shadow2_mode_external(d) ) + if ( !shadow_mode_external(d) ) { DPRINTK("Can't init HVM for dom %u vcpu %u: " - "not in shadow2 external mode\n", + "not in shadow external mode\n", d->domain_id, v->vcpu_id); domain_crash(d); } @@ -865,7 +865,7 @@ static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs) } #endif - result = shadow2_fault(va, regs); + result = shadow_fault(va, regs); TRACE_VMEXIT (2,result); #if 0 @@ -1039,7 +1039,7 @@ static void vmx_vmexit_do_invlpg(unsigned long va) * We do the safest things first, then try to update the shadow * copying from guest */ - shadow2_invlpg(v, va); + 
shadow_invlpg(v, va); } @@ -1301,7 +1301,7 @@ vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c) skip_cr3: - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); if (!vmx_paging_enabled(v)) HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table"); else @@ -1504,7 +1504,7 @@ static int vmx_set_cr0(unsigned long value) v->arch.guest_table = pagetable_from_pfn(mfn); if (old_base_mfn) put_page(mfn_to_page(old_base_mfn)); - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", (unsigned long) (mfn << PAGE_SHIFT)); @@ -1577,7 +1577,7 @@ static int vmx_set_cr0(unsigned long value) else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE ) { __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); } return 1; @@ -1662,7 +1662,7 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs) mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); if (mfn != pagetable_get_pfn(v->arch.guest_table)) __hvm_bug(regs); - shadow2_update_cr3(v); + shadow_update_cr3(v); } else { /* * If different, make a shadow. Check if the PDBR is valid @@ -1755,7 +1755,7 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs) * all TLB entries except global entries. 
*/ if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); break; } default: diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index b688bd90f2..541938c073 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -454,12 +454,12 @@ int map_ldt_shadow_page(unsigned int off) res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page); - if ( !res && unlikely(shadow2_mode_refcounts(d)) ) + if ( !res && unlikely(shadow_mode_refcounts(d)) ) { - shadow2_lock(d); - shadow2_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0); + shadow_lock(d); + shadow_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0); res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page); - shadow2_unlock(d); + shadow_unlock(d); } if ( unlikely(!res) ) @@ -527,7 +527,7 @@ get_linear_pagetable( struct page_info *page; unsigned long pfn; - ASSERT( !shadow2_mode_refcounts(d) ); + ASSERT( !shadow_mode_refcounts(d) ); if ( (root_get_flags(re) & _PAGE_RW) ) { @@ -602,12 +602,12 @@ get_page_from_l1e( d = dom_io; } - /* Foreign mappings into guests in shadow2 external mode don't + /* Foreign mappings into guests in shadow external mode don't * contribute to writeable mapping refcounts. (This allows the * qemu-dm helper process in dom0 to map the domain's memory without * messing up the count of "real" writable mappings.) */ okay = (((l1e_get_flags(l1e) & _PAGE_RW) && - !(unlikely(shadow2_mode_external(d) && (d != current->domain)))) + !(unlikely(shadow_mode_external(d) && (d != current->domain)))) ? 
get_page_and_type(page, d, PGT_writable_page) : get_page(page, d)); if ( !okay ) @@ -771,9 +771,9 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) } /* Remember we didn't take a type-count of foreign writable mappings - * to shadow2 external domains */ + * to shadow external domains */ if ( (l1e_get_flags(l1e) & _PAGE_RW) && - !(unlikely((e != d) && shadow2_mode_external(e))) ) + !(unlikely((e != d) && shadow_mode_external(e))) ) { put_page_and_type(page); } @@ -830,7 +830,7 @@ static int alloc_l1_table(struct page_info *page) l1_pgentry_t *pl1e; int i; - ASSERT(!shadow2_mode_refcounts(d)); + ASSERT(!shadow_mode_refcounts(d)); pl1e = map_domain_page(pfn); @@ -883,7 +883,7 @@ static int create_pae_xen_mappings(l3_pgentry_t *pl3e) * a. alloc_l3_table() calls this function and this check will fail * b. mod_l3_entry() disallows updates to slot 3 in an existing table * - * XXX -- this needs revisiting for shadow2_mode_refcount()==true... + * XXX -- this needs revisiting for shadow_mode_refcount()==true... 
*/ page = l3e_get_page(l3e3); BUG_ON(page->u.inuse.type_info & PGT_pinned); @@ -1007,7 +1007,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) l2_pgentry_t *pl2e; int i; - ASSERT(!shadow2_mode_refcounts(d)); + ASSERT(!shadow_mode_refcounts(d)); pl2e = map_domain_page(pfn); @@ -1059,7 +1059,7 @@ static int alloc_l3_table(struct page_info *page, unsigned long type) l3_pgentry_t *pl3e; int i; - ASSERT(!shadow2_mode_refcounts(d)); + ASSERT(!shadow_mode_refcounts(d)); #ifdef CONFIG_X86_PAE /* @@ -1120,7 +1120,7 @@ static int alloc_l4_table(struct page_info *page, unsigned long type) unsigned long vaddr; int i; - ASSERT(!shadow2_mode_refcounts(d)); + ASSERT(!shadow_mode_refcounts(d)); for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ ) { @@ -1234,8 +1234,8 @@ static inline int update_l1e(l1_pgentry_t *pl1e, struct vcpu *v) { int rv = 1; - if ( unlikely(shadow2_mode_enabled(v->domain)) ) - shadow2_lock(v->domain); + if ( unlikely(shadow_mode_enabled(v->domain)) ) + shadow_lock(v->domain); #ifndef PTE_UPDATE_WITH_CMPXCHG rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e))); #else @@ -1266,10 +1266,10 @@ static inline int update_l1e(l1_pgentry_t *pl1e, } } #endif - if ( unlikely(shadow2_mode_enabled(v->domain)) ) + if ( unlikely(shadow_mode_enabled(v->domain)) ) { - shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e); - shadow2_unlock(v->domain); + shadow_validate_guest_entry(v, _mfn(gl1mfn), pl1e); + shadow_unlock(v->domain); } return rv; } @@ -1339,13 +1339,13 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, #endif #define UPDATE_ENTRY(_t,_p,_o,_n,_m) ({ \ int rv; \ - if ( unlikely(shadow2_mode_enabled(current->domain)) ) \ - shadow2_lock(current->domain); \ + if ( unlikely(shadow_mode_enabled(current->domain)) ) \ + shadow_lock(current->domain); \ rv = _UPDATE_ENTRY(_t, _p, _o, _n); \ - if ( unlikely(shadow2_mode_enabled(current->domain)) ) \ + if ( unlikely(shadow_mode_enabled(current->domain)) ) \ { \ - 
shadow2_validate_guest_entry(current, _mfn(_m), (_p)); \ - shadow2_unlock(current->domain); \ + shadow_validate_guest_entry(current, _mfn(_m), (_p)); \ + shadow_unlock(current->domain); \ } \ rv; \ }) @@ -1581,21 +1581,21 @@ void free_page_type(struct page_info *page, unsigned long type) */ this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS; - if ( unlikely(shadow2_mode_enabled(owner) - && !shadow2_lock_is_acquired(owner)) ) + if ( unlikely(shadow_mode_enabled(owner) + && !shadow_lock_is_acquired(owner)) ) { /* Raw page tables are rewritten during save/restore. */ - if ( !shadow2_mode_translate(owner) ) + if ( !shadow_mode_translate(owner) ) mark_dirty(owner, page_to_mfn(page)); - if ( shadow2_mode_refcounts(owner) ) + if ( shadow_mode_refcounts(owner) ) return; gmfn = mfn_to_gmfn(owner, page_to_mfn(page)); ASSERT(VALID_M2P(gmfn)); - shadow2_lock(owner); - shadow2_remove_all_shadows(owner->vcpu[0], _mfn(gmfn)); - shadow2_unlock(owner); + shadow_lock(owner); + shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn)); + shadow_unlock(owner); } } @@ -1760,7 +1760,7 @@ int get_page_type(struct page_info *page, unsigned long type) #endif /* Fixme: add code to propagate va_unknown to subtables. */ if ( ((type & PGT_type_mask) >= PGT_l2_page_table) && - !shadow2_mode_refcounts(page_get_owner(page)) ) + !shadow_mode_refcounts(page_get_owner(page)) ) return 0; /* This table is possibly mapped at multiple locations. 
*/ nx &= ~PGT_va_mask; @@ -1810,7 +1810,7 @@ int new_guest_cr3(unsigned long mfn) if ( hvm_guest(v) && !hvm_paging_enabled(v) ) domain_crash_synchronous(); - if ( shadow2_mode_refcounts(d) ) + if ( shadow_mode_refcounts(d) ) { okay = get_page_from_pagenr(mfn, d); if ( unlikely(!okay) ) @@ -1858,7 +1858,7 @@ int new_guest_cr3(unsigned long mfn) if ( likely(old_base_mfn != 0) ) { - if ( shadow2_mode_refcounts(d) ) + if ( shadow_mode_refcounts(d) ) put_page(mfn_to_page(old_base_mfn)); else put_page_and_type(mfn_to_page(old_base_mfn)); @@ -2043,7 +2043,7 @@ int do_mmuext_op( type = PGT_root_page_table; pin_page: - if ( shadow2_mode_refcounts(FOREIGNDOM) ) + if ( shadow_mode_refcounts(FOREIGNDOM) ) break; okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM); @@ -2065,7 +2065,7 @@ int do_mmuext_op( break; case MMUEXT_UNPIN_TABLE: - if ( shadow2_mode_refcounts(d) ) + if ( shadow_mode_refcounts(d) ) break; if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) ) @@ -2078,11 +2078,11 @@ int do_mmuext_op( { put_page_and_type(page); put_page(page); - if ( shadow2_mode_enabled(d) ) + if ( shadow_mode_enabled(d) ) { - shadow2_lock(d); - shadow2_remove_all_shadows(v, _mfn(mfn)); - shadow2_unlock(d); + shadow_lock(d); + shadow_remove_all_shadows(v, _mfn(mfn)); + shadow_unlock(d); } } else @@ -2125,8 +2125,8 @@ int do_mmuext_op( break; case MMUEXT_INVLPG_LOCAL: - if ( !shadow2_mode_enabled(d) - || shadow2_invlpg(v, op.arg1.linear_addr) != 0 ) + if ( !shadow_mode_enabled(d) + || shadow_invlpg(v, op.arg1.linear_addr) != 0 ) local_flush_tlb_one(op.arg1.linear_addr); break; @@ -2173,7 +2173,7 @@ int do_mmuext_op( unsigned long ptr = op.arg1.linear_addr; unsigned long ents = op.arg2.nr_ents; - if ( shadow2_mode_external(d) ) + if ( shadow_mode_external(d) ) { MEM_LOG("ignoring SET_LDT hypercall from external " "domain %u", d->domain_id); @@ -2319,7 +2319,7 @@ int do_mmu_update( case PGT_l3_page_table: case PGT_l4_page_table: { - if ( shadow2_mode_refcounts(d) ) + if ( 
shadow_mode_refcounts(d) ) { DPRINTK("mmu update on shadow-refcounted domain!"); break; @@ -2372,16 +2372,16 @@ int do_mmu_update( if ( unlikely(!get_page_type(page, PGT_writable_page)) ) break; - if ( unlikely(shadow2_mode_enabled(d)) ) - shadow2_lock(d); + if ( unlikely(shadow_mode_enabled(d)) ) + shadow_lock(d); *(intpte_t *)va = req.val; okay = 1; - if ( unlikely(shadow2_mode_enabled(d)) ) + if ( unlikely(shadow_mode_enabled(d)) ) { - shadow2_validate_guest_entry(v, _mfn(mfn), va); - shadow2_unlock(d); + shadow_validate_guest_entry(v, _mfn(mfn), va); + shadow_unlock(d); } put_page_type(page); @@ -2405,8 +2405,8 @@ int do_mmu_update( break; } - if ( shadow2_mode_translate(FOREIGNDOM) ) - shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn); + if ( shadow_mode_translate(FOREIGNDOM) ) + shadow_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn); else set_gpfn_from_mfn(mfn, gpfn); okay = 1; @@ -2492,7 +2492,7 @@ static int create_grant_pte_mapping( goto failed; } - if ( !shadow2_mode_refcounts(d) ) + if ( !shadow_mode_refcounts(d) ) put_page_from_l1e(ol1e, d); put_page_type(page); @@ -2590,7 +2590,7 @@ static int create_grant_va_mapping( l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) ) return GNTST_general_error; - if ( !shadow2_mode_refcounts(d) ) + if ( !shadow_mode_refcounts(d) ) put_page_from_l1e(ol1e, d); return GNTST_okay; @@ -2714,10 +2714,10 @@ int do_update_va_mapping(unsigned long va, u64 val64, perfc_incrc(calls_to_update_va); - if ( unlikely(!__addr_ok(va) && !shadow2_mode_external(d)) ) + if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) ) return -EINVAL; - if ( unlikely(shadow2_mode_refcounts(d)) ) + if ( unlikely(shadow_mode_refcounts(d)) ) { DPRINTK("Grant op on a shadow-refcounted domain\n"); return -EINVAL; @@ -2725,11 +2725,11 @@ int do_update_va_mapping(unsigned long va, u64 val64, LOCK_BIGLOCK(d); - if ( likely(rc == 0) && unlikely(shadow2_mode_enabled(d)) ) + if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) ) { if ( 
unlikely(this_cpu(percpu_mm_info).foreign && - (shadow2_mode_translate(d) || - shadow2_mode_translate( + (shadow_mode_translate(d) || + shadow_mode_translate( this_cpu(percpu_mm_info).foreign))) ) { /* @@ -2770,8 +2770,8 @@ int do_update_va_mapping(unsigned long va, u64 val64, switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) ) { case UVMF_LOCAL: - if ( !shadow2_mode_enabled(d) - || (shadow2_invlpg(current, va) != 0) ) + if ( !shadow_mode_enabled(d) + || (shadow_invlpg(current, va) != 0) ) local_flush_tlb_one(va); break; case UVMF_ALL: @@ -3006,7 +3006,7 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg) break; } - if ( !shadow2_mode_translate(d) || (mfn == 0) ) + if ( !shadow_mode_translate(d) || (mfn == 0) ) { put_domain(d); return -EINVAL; @@ -3196,21 +3196,21 @@ static int ptwr_emulated_update( pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK)); if ( do_cmpxchg ) { - if ( shadow2_mode_enabled(d) ) - shadow2_lock(d); + if ( shadow_mode_enabled(d) ) + shadow_lock(d); ol1e = l1e_from_intpte(old); if ( cmpxchg((intpte_t *)pl1e, old, val) != old ) { - if ( shadow2_mode_enabled(d) ) - shadow2_unlock(d); + if ( shadow_mode_enabled(d) ) + shadow_unlock(d); unmap_domain_page(pl1e); put_page_from_l1e(nl1e, d); return X86EMUL_CMPXCHG_FAILED; } - if ( unlikely(shadow2_mode_enabled(v->domain)) ) + if ( unlikely(shadow_mode_enabled(v->domain)) ) { - shadow2_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e); - shadow2_unlock(v->domain); + shadow_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e); + shadow_unlock(v->domain); } } else diff --git a/xen/arch/x86/mm/Makefile b/xen/arch/x86/mm/Makefile new file mode 100644 index 0000000000..5043fd9501 --- /dev/null +++ b/xen/arch/x86/mm/Makefile @@ -0,0 +1 @@ +subdir-y += shadow diff --git a/xen/arch/x86/mm/shadow/Makefile b/xen/arch/x86/mm/shadow/Makefile new file mode 100644 index 0000000000..6de7cca484 --- /dev/null +++ b/xen/arch/x86/mm/shadow/Makefile @@ -0,0 +1,15 @@ +ifneq ($(pae),n) 
+obj-$(x86_32) += common.o g2_on_s3.o g3_on_s3.o +else +obj-$(x86_32) += common.o g2_on_s2.o +endif + +obj-$(x86_64) += common.o g4_on_s4.o g3_on_s3.o g2_on_s3.o + +guest_levels = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(1))))) +shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(1))))) +shadow_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \ + -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1)) + +g%.o: multi.c $(HDRS) Makefile + $(CC) $(CFLAGS) $(call shadow_defns,$(@F)) -c $< -o $@ diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c new file mode 100644 index 0000000000..8e7e9051b4 --- /dev/null +++ b/xen/arch/x86/mm/shadow/common.c @@ -0,0 +1,3407 @@ +/****************************************************************************** + * arch/x86/mm/shadow/common.c + * + * Shadow code that does not need to be multiply compiled. + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define SHADOW 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "private.h" + +#if SHADOW_AUDIT +int shadow_audit_enable = 0; + +static void shadow_audit_key(unsigned char key) +{ + shadow_audit_enable = !shadow_audit_enable; + printk("%s shadow_audit_enable=%d\n", + __func__, shadow_audit_enable); +} + +static int __init shadow_audit_key_init(void) +{ + register_keyhandler( + 'O', shadow_audit_key, "toggle shadow audits"); + return 0; +} +__initcall(shadow_audit_key_init); +#endif /* SHADOW_AUDIT */ + +static void sh_free_log_dirty_bitmap(struct domain *d); + +int _shadow_mode_refcounts(struct domain *d) +{ + return shadow_mode_refcounts(d); +} + + +/**************************************************************************/ +/* x86 emulator support for the shadow code + */ + +static int +sh_x86_emulate_read_std(unsigned long addr, + unsigned long *val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; + if ( hvm_guest(v) ) + { + *val = 0; + // XXX -- this is WRONG. + // It entirely ignores the permissions in the page tables. + // In this case, that is only a user vs supervisor access check. + // + if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) ) + { +#if 0 + SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, + addr, *val, bytes); +#endif + return X86EMUL_CONTINUE; + } + + /* If we got here, there was nothing mapped here, or a bad GFN + * was mapped here. This should never happen: we're here because + * of a write fault at the end of the instruction we're emulating. 
*/ + SHADOW_PRINTK("read failed to va %#lx\n", addr); + return X86EMUL_PROPAGATE_FAULT; + } + else + { + SHADOW_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh_x86_emulate_write_std(unsigned long addr, + unsigned long val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, addr, val, bytes); +#endif + if ( hvm_guest(v) ) + { + // XXX -- this is WRONG. + // It entirely ignores the permissions in the page tables. + // In this case, that includes user vs supervisor, and + // write access. + // + if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) ) + return X86EMUL_CONTINUE; + + /* If we got here, there was nothing mapped here, or a bad GFN + * was mapped here. This should never happen: we're here because + * of a write fault at the end of the instruction we're emulating, + * which should be handled by sh_x86_emulate_write_emulated. 
*/ + SHADOW_PRINTK("write failed to va %#lx\n", addr); + return X86EMUL_PROPAGATE_FAULT; + } + else + { + SHADOW_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh_x86_emulate_write_emulated(unsigned long addr, + unsigned long val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, addr, val, bytes); +#endif + if ( hvm_guest(v) ) + { + return v->arch.shadow.mode->x86_emulate_write(v, addr, &val, bytes, ctxt); + } + else + { + SHADOW_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh_x86_emulate_cmpxchg_emulated(unsigned long addr, + unsigned long old, + unsigned long new, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, addr, old, new, bytes); +#endif + if ( hvm_guest(v) ) + { + return v->arch.shadow.mode->x86_emulate_cmpxchg(v, addr, old, new, + bytes, ctxt); + } + else + { + SHADOW_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh_x86_emulate_cmpxchg8b_emulated(unsigned long addr, + unsigned long old_lo, + unsigned long old_hi, + unsigned long new_lo, + unsigned long new_hi, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n", + v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo, + new_hi, new_lo, ctxt); +#endif + if ( hvm_guest(v) ) + { + return v->arch.shadow.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi, + new_lo, new_hi, ctxt); + } + else + { + SHADOW_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + + +struct x86_emulate_ops shadow_emulator_ops = { + .read_std = sh_x86_emulate_read_std, + .write_std = 
sh_x86_emulate_write_std, + .read_emulated = sh_x86_emulate_read_std, + .write_emulated = sh_x86_emulate_write_emulated, + .cmpxchg_emulated = sh_x86_emulate_cmpxchg_emulated, + .cmpxchg8b_emulated = sh_x86_emulate_cmpxchg8b_emulated, +}; + + +/**************************************************************************/ +/* Code for "promoting" a guest page to the point where the shadow code is + * willing to let it be treated as a guest page table. This generally + * involves making sure there are no writable mappings available to the guest + * for this page. + */ +void shadow_promote(struct vcpu *v, mfn_t gmfn, u32 type) +{ + struct page_info *page = mfn_to_page(gmfn); + unsigned long type_info; + + ASSERT(valid_mfn(gmfn)); + + /* We should never try to promote a gmfn that has writeable mappings */ + ASSERT(shadow_remove_write_access(v, gmfn, 0, 0) == 0); + + // Is the page already shadowed? + if ( !test_and_set_bit(_PGC_page_table, &page->count_info) ) + { + // No prior shadow exists... + + // Grab a type-ref. We don't really care if we are racing with another + // vcpu or not, or even what kind of type we get; we just want the type + // count to be > 0. + // + do { + type_info = + page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask); + } while ( !get_page_type(page, type_info) ); + + // Now that the type ref is non-zero, we can safely use the + // shadow_flags. 
+ // + page->shadow_flags = 0; + } + + ASSERT(!test_bit(type >> PGC_SH_type_shift, &page->shadow_flags)); + set_bit(type >> PGC_SH_type_shift, &page->shadow_flags); +} + +void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type) +{ + struct page_info *page = mfn_to_page(gmfn); + + ASSERT(test_bit(_PGC_page_table, &page->count_info)); + ASSERT(test_bit(type >> PGC_SH_type_shift, &page->shadow_flags)); + + clear_bit(type >> PGC_SH_type_shift, &page->shadow_flags); + + if ( (page->shadow_flags & SHF_page_type_mask) == 0 ) + { + // release the extra type ref + put_page_type(page); + + // clear the is-a-page-table bit. + clear_bit(_PGC_page_table, &page->count_info); + } +} + +/**************************************************************************/ +/* Validate a pagetable change from the guest and update the shadows. + * Returns a bitmask of SHADOW_SET_* flags. */ + +static int +__shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, + void *entry, u32 size) +{ + int result = 0; + struct page_info *page = mfn_to_page(gmfn); + + sh_mark_dirty(v->domain, gmfn); + + // Determine which types of shadows are affected, and update each. + // + // Always validate L1s before L2s to prevent another cpu with a linear + // mapping of this gmfn from seeing a walk that results from + // using the new L2 value and the old L1 value. (It is OK for such a + // guest to see a walk that uses the old L2 value with the new L1 value, + // as hardware could behave this way if one level of the pagewalk occurs + // before the store, and the next level of the pagewalk occurs after the + // store. + // + // Ditto for L2s before L3s, etc. 
+ // + + if ( !(page->count_info & PGC_page_table) ) + return 0; /* Not shadowed at all */ + +#if CONFIG_PAGING_LEVELS == 2 + if ( page->shadow_flags & SHF_L1_32 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2) + (v, gmfn, entry, size); +#else + if ( page->shadow_flags & SHF_L1_32 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2) + (v, gmfn, entry, size); +#endif + +#if CONFIG_PAGING_LEVELS == 2 + if ( page->shadow_flags & SHF_L2_32 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2) + (v, gmfn, entry, size); +#else + if ( page->shadow_flags & SHF_L2_32 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2) + (v, gmfn, entry, size); +#endif + +#if CONFIG_PAGING_LEVELS >= 3 + if ( page->shadow_flags & SHF_L1_PAE ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3) + (v, gmfn, entry, size); + if ( page->shadow_flags & SHF_L2_PAE ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3) + (v, gmfn, entry, size); + if ( page->shadow_flags & SHF_L2H_PAE ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3) + (v, gmfn, entry, size); + if ( page->shadow_flags & SHF_L3_PAE ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 3, 3) + (v, gmfn, entry, size); +#else /* 32-bit non-PAE hypervisor does not support PAE guests */ + ASSERT((page->shadow_flags & (SHF_L3_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0); +#endif + +#if CONFIG_PAGING_LEVELS >= 4 + if ( page->shadow_flags & SHF_L1_64 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4) + (v, gmfn, entry, size); + if ( page->shadow_flags & SHF_L2_64 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4) + (v, gmfn, entry, size); + if ( page->shadow_flags & SHF_L3_64 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4) + (v, gmfn, entry, size); + if ( page->shadow_flags & SHF_L4_64 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4) + (v, gmfn, entry, size); +#else /* 
32-bit/PAE hypervisor does not support 64-bit guests */ + ASSERT((page->shadow_flags + & (SHF_L4_64|SHF_L3_64|SHF_L2_64|SHF_L1_64)) == 0); +#endif + + return result; +} + + +int +shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry) +/* This is the entry point from hypercalls. It returns a bitmask of all the + * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */ +{ + int rc; + + ASSERT(shadow_lock_is_acquired(v->domain)); + rc = __shadow_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t)); + shadow_audit_tables(v); + return rc; +} + +void +shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, + void *entry, u32 size) +/* This is the entry point for emulated writes to pagetables in HVM guests */ +{ + struct domain *d = v->domain; + int rc; + + ASSERT(shadow_lock_is_acquired(v->domain)); + rc = __shadow_validate_guest_entry(v, gmfn, entry, size); + if ( rc & SHADOW_SET_FLUSH ) + { + // Flush everyone except the local processor, which will flush when it + // re-enters the HVM guest. + // + cpumask_t mask = d->domain_dirty_cpumask; + cpu_clear(v->processor, mask); + flush_tlb_mask(mask); + } + if ( rc & SHADOW_SET_ERROR ) + { + /* This page is probably not a pagetable any more: tear it out of the + * shadows, along with any tables that reference it */ + shadow_remove_all_shadows_and_parents(v, gmfn); + } + /* We ignore the other bits: since we are about to change CR3 on + * VMENTER we don't need to do any extra TLB flushes. */ +} + + +/**************************************************************************/ +/* Memory management for shadow pages. */ + +/* Meaning of the count_info field in shadow pages + * ---------------------------------------------- + * + * A count of all references to this page from other shadow pages and + * guest CR3s (a.k.a. v->arch.shadow.table). + * + * The top bits hold the shadow type and the pinned bit. 
Top-level + * shadows are pinned so that they don't disappear when not in a CR3 + * somewhere. + * + * We don't need to use get|put_page for this as the updates are all + * protected by the shadow lock. We can't use get|put_page for this + * as the size of the count on shadow pages is different from that on + * normal guest pages. + */ + +/* Meaning of the type_info field in shadow pages + * ---------------------------------------------- + * + * type_info use depends on the shadow type (from count_info) + * + * PGC_SH_none : This page is in the shadow free pool. type_info holds + * the chunk order for our freelist allocator. + * + * PGC_SH_l*_shadow : This page is in use as a shadow. type_info + * holds the mfn of the guest page being shadowed, + * + * PGC_SH_fl1_*_shadow : This page is being used to shatter a superpage. + * type_info holds the gfn being shattered. + * + * PGC_SH_monitor_table : This page is part of a monitor table. + * type_info is not used. + */ + +/* Meaning of the _domain field in shadow pages + * -------------------------------------------- + * + * In shadow pages, this field will always have its least significant bit + * set. This ensures that all attempts to get_page() will fail (as all + * valid pickled domain pointers have a zero for their least significant bit). + * Instead, the remaining upper bits are used to record the shadow generation + * counter when the shadow was created. + */ + +/* Meaning of the shadow_flags field + * ---------------------------------- + * + * In guest pages that are shadowed, one bit for each kind of shadow they have. + * + * In shadow pages, will be used for holding a representation of the populated + * entries in this shadow (either a min/max, or a bitmap, or ...) + * + * In monitor-table pages, holds the level of the particular page (to save + * spilling the shadow types into an extra bit by having three types of monitor + * page). 
+ */ + +/* Meaning of the list_head struct in shadow pages + * ----------------------------------------------- + * + * In free shadow pages, this is used to hold the free-lists of chunks. + * + * In top-level shadow tables, this holds a linked-list of all top-level + * shadows (used for recovering memory and destroying shadows). + * + * In lower-level shadows, this holds the physical address of a higher-level + * shadow entry that holds a reference to this shadow (or zero). + */ + +/* Allocating shadow pages + * ----------------------- + * + * Most shadow pages are allocated singly, but there are two cases where we + * need to allocate multiple pages together. + * + * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows. + * A 32-bit guest l1 table covers 4MB of virtuial address space, + * and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB + * of virtual address space each). Similarly, a 32-bit guest l2 table + * (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va + * each). These multi-page shadows are contiguous and aligned; + * functions for handling offsets into them are defined in shadow.c + * (shadow_l1_index() etc.) + * + * 2: Shadowing PAE top-level pages. Each guest page that contains + * any PAE top-level pages requires two shadow pages to shadow it. + * They contain alternating l3 tables and pae_l3_bookkeeping structs. + * + * This table shows the allocation behaviour of the different modes: + * + * Xen paging 32b pae pae 64b 64b 64b + * Guest paging 32b 32b pae 32b pae 64b + * PV or HVM * HVM * HVM HVM * + * Shadow paging 32b pae pae pae pae 64b + * + * sl1 size 4k 8k 4k 8k 4k 4k + * sl2 size 4k 16k 4k 16k 4k 4k + * sl3 size - - 8k - 8k 4k + * sl4 size - - - - - 4k + * + * We allocate memory from xen in four-page units and break them down + * with a simple buddy allocator. Can't use the xen allocator to handle + * this as it only works for contiguous zones, and a domain's shadow + * pool is made of fragments. 
+ * + * In HVM guests, the p2m table is built out of shadow pages, and we provide + * a function for the p2m management to steal pages, in max-order chunks, from + * the free pool. We don't provide for giving them back, yet. + */ + +/* Figure out the least acceptable quantity of shadow memory. + * The minimum memory requirement for always being able to free up a + * chunk of memory is very small -- only three max-order chunks per + * vcpu to hold the top level shadows and pages with Xen mappings in them. + * + * But for a guest to be guaranteed to successfully execute a single + * instruction, we must be able to map a large number (about thirty) VAs + * at the same time, which means that to guarantee progress, we must + * allow for more than ninety allocated pages per vcpu. We round that + * up to 128 pages, or half a megabyte per vcpu. */ +unsigned int shadow_min_acceptable_pages(struct domain *d) +{ + u32 vcpu_count = 0; + struct vcpu *v; + + for_each_vcpu(d, v) + vcpu_count++; + + return (vcpu_count * 128); +} + +/* Using the type_info field to store freelist order */ +#define SH_PFN_ORDER(_p) ((_p)->u.inuse.type_info) +#define SH_SET_PFN_ORDER(_p, _o) \ + do { (_p)->u.inuse.type_info = (_o); } while (0) + + +/* Figure out the order of allocation needed for a given shadow type */ +static inline u32 +shadow_order(u32 shadow_type) +{ +#if CONFIG_PAGING_LEVELS > 2 + static const u32 type_to_order[16] = { + 0, /* PGC_SH_none */ + 1, /* PGC_SH_l1_32_shadow */ + 1, /* PGC_SH_fl1_32_shadow */ + 2, /* PGC_SH_l2_32_shadow */ + 0, /* PGC_SH_l1_pae_shadow */ + 0, /* PGC_SH_fl1_pae_shadow */ + 0, /* PGC_SH_l2_pae_shadow */ + 0, /* PGC_SH_l2h_pae_shadow */ + 1, /* PGC_SH_l3_pae_shadow */ + 0, /* PGC_SH_l1_64_shadow */ + 0, /* PGC_SH_fl1_64_shadow */ + 0, /* PGC_SH_l2_64_shadow */ + 0, /* PGC_SH_l3_64_shadow */ + 0, /* PGC_SH_l4_64_shadow */ + 2, /* PGC_SH_p2m_table */ + 0 /* PGC_SH_monitor_table */ + }; + u32 type = (shadow_type & PGC_SH_type_mask) >> PGC_SH_type_shift; + 
return type_to_order[type]; +#else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */ + return 0; +#endif +} + + +/* Do we have a free chunk of at least this order? */ +static inline int chunk_is_available(struct domain *d, int order) +{ + int i; + + for ( i = order; i <= SHADOW_MAX_ORDER; i++ ) + if ( !list_empty(&d->arch.shadow.freelists[i]) ) + return 1; + return 0; +} + +/* Dispatcher function: call the per-mode function that will unhook the + * non-Xen mappings in this top-level shadow mfn */ +void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn) +{ + struct page_info *pg = mfn_to_page(smfn); + switch ( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift ) + { + case PGC_SH_l2_32_shadow >> PGC_SH_type_shift: +#if CONFIG_PAGING_LEVELS == 2 + SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn); +#else + SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn); +#endif + break; +#if CONFIG_PAGING_LEVELS >= 3 + case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn); + break; +#endif +#if CONFIG_PAGING_LEVELS >= 4 + case PGC_SH_l4_64_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn); + break; +#endif + default: + SHADOW_PRINTK("top-level shadow has bad type %08lx\n", + (unsigned long)((pg->count_info & PGC_SH_type_mask) + >> PGC_SH_type_shift)); + BUG(); + } +} + + +/* Make sure there is at least one chunk of the required order available + * in the shadow page pool. This must be called before any calls to + * shadow_alloc(). Since this will free existing shadows to make room, + * it must be called early enough to avoid freeing shadows that the + * caller is currently working on. 
*/ +void shadow_prealloc(struct domain *d, unsigned int order) +{ + /* Need a vpcu for calling unpins; for now, since we don't have + * per-vcpu shadows, any will do */ + struct vcpu *v = d->vcpu[0]; + struct list_head *l, *t; + struct page_info *pg; + mfn_t smfn; + + if ( chunk_is_available(d, order) ) return; + + /* Stage one: walk the list of top-level pages, unpinning them */ + perfc_incrc(shadow_prealloc_1); + list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows) + { + pg = list_entry(l, struct page_info, list); + smfn = page_to_mfn(pg); + +#if CONFIG_PAGING_LEVELS >= 3 + if ( (pg->count_info & PGC_SH_type_mask) == PGC_SH_l3_pae_shadow ) + { + /* For PAE, we need to unpin each subshadow on this shadow */ + SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn); + } + else +#endif /* 32-bit code always takes this branch */ + { + /* Unpin this top-level shadow */ + sh_unpin(v, smfn); + } + + /* See if that freed up a chunk of appropriate size */ + if ( chunk_is_available(d, order) ) return; + } + + /* Stage two: all shadow pages are in use in hierarchies that are + * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen + * mappings. */ + perfc_incrc(shadow_prealloc_2); + v = current; + if ( v->domain != d ) + v = d->vcpu[0]; + /* Walk the list from the tail: recently used toplevels have been pulled + * to the head */ + list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows) + { + pg = list_entry(l, struct page_info, list); + smfn = page_to_mfn(pg); + shadow_unhook_mappings(v, smfn); + + /* Need to flush TLB if we've altered our own tables */ + if ( !shadow_mode_external(d) + && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) ) + local_flush_tlb(); + + /* See if that freed up a chunk of appropriate size */ + if ( chunk_is_available(d, order) ) return; + } + + /* Nothing more we can do: all remaining shadows are of pages that + * hold Xen mappings for some vcpu. This can never happen. 
*/ + SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n" + " shadow pages total = %u, free = %u, p2m=%u\n", + 1 << order, + d->arch.shadow.total_pages, + d->arch.shadow.free_pages, + d->arch.shadow.p2m_pages); + BUG(); +} + + +/* Allocate another shadow's worth of (contiguous, aligned) pages, + * and fill in the type and backpointer fields of their page_infos. + * Never fails to allocate. */ +mfn_t shadow_alloc(struct domain *d, + u32 shadow_type, + unsigned long backpointer) +{ + struct page_info *pg = NULL; + unsigned int order = shadow_order(shadow_type); + cpumask_t mask; + void *p; + int i; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(order <= SHADOW_MAX_ORDER); + ASSERT(shadow_type != PGC_SH_none); + perfc_incrc(shadow_alloc); + + /* Find smallest order which can satisfy the request. */ + for ( i = order; i <= SHADOW_MAX_ORDER; i++ ) + if ( !list_empty(&d->arch.shadow.freelists[i]) ) + { + pg = list_entry(d->arch.shadow.freelists[i].next, + struct page_info, list); + list_del(&pg->list); + + /* We may have to halve the chunk a number of times. */ + while ( i != order ) + { + i--; + SH_SET_PFN_ORDER(pg, i); + list_add_tail(&pg->list, &d->arch.shadow.freelists[i]); + pg += 1 << i; + } + d->arch.shadow.free_pages -= 1 << order; + + /* Init page info fields and clear the pages */ + for ( i = 0; i < 1<domain_dirty_cpumask; + tlbflush_filter(mask, pg[i].tlbflush_timestamp); + if ( unlikely(!cpus_empty(mask)) ) + { + perfc_incrc(shadow_alloc_tlbflush); + flush_tlb_mask(mask); + } + /* Now safe to clear the page for reuse */ + p = sh_map_domain_page(page_to_mfn(pg+i)); + ASSERT(p != NULL); + clear_page(p); + sh_unmap_domain_page(p); + perfc_incr(shadow_alloc_count); + } + return page_to_mfn(pg); + } + + /* If we get here, we failed to allocate. This should never happen. + * It means that we didn't call shadow_prealloc() correctly before + * we allocated. 
We can't recover by calling prealloc here, because + * we might free up higher-level pages that the caller is working on. */ + SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order); + BUG(); +} + + +/* Return some shadow pages to the pool. */ +void shadow_free(struct domain *d, mfn_t smfn) +{ + struct page_info *pg = mfn_to_page(smfn); + u32 shadow_type; + unsigned long order; + unsigned long mask; + int i; + + ASSERT(shadow_lock_is_acquired(d)); + perfc_incrc(shadow_free); + + shadow_type = pg->count_info & PGC_SH_type_mask; + ASSERT(shadow_type != PGC_SH_none); + ASSERT(shadow_type != PGC_SH_p2m_table); + order = shadow_order(shadow_type); + + d->arch.shadow.free_pages += 1 << order; + + for ( i = 0; i < 1<count_info & PGC_SH_type_mask) != PGT_none) + || (SH_PFN_ORDER(pg-mask) != order) ) + break; + list_del(&(pg-mask)->list); + pg -= mask; + } else { + /* Merge with successor block? */ + if ( (((pg+mask)->count_info & PGC_SH_type_mask) != PGT_none) + || (SH_PFN_ORDER(pg+mask) != order) ) + break; + list_del(&(pg+mask)->list); + } + order++; + } + + SH_SET_PFN_ORDER(pg, order); + list_add_tail(&pg->list, &d->arch.shadow.freelists[order]); +} + +/* Divert some memory from the pool to be used by the p2m mapping. + * This action is irreversible: the p2m mapping only ever grows. + * That's OK because the p2m table only exists for external domains, + * and those domains can't ever turn off shadow mode. + * Also, we only ever allocate a max-order chunk, so as to preserve + * the invariant that shadow_prealloc() always works. + * Returns 0 iff it can't get a chunk (the caller should then + * free up some pages in domheap and call set_sh_allocation); + * returns non-zero on success. 
+ */ +static int +shadow_alloc_p2m_pages(struct domain *d) +{ + struct page_info *pg; + u32 i; + ASSERT(shadow_lock_is_acquired(d)); + + if ( d->arch.shadow.total_pages + < (shadow_min_acceptable_pages(d) + (1<arch.shadow.p2m_pages += (1<arch.shadow.total_pages -= (1<arch.shadow.p2m_freelist); + } + return 1; +} + +// Returns 0 if no memory is available... +mfn_t +shadow_alloc_p2m_page(struct domain *d) +{ + struct list_head *entry; + mfn_t mfn; + void *p; + + if ( list_empty(&d->arch.shadow.p2m_freelist) && + !shadow_alloc_p2m_pages(d) ) + return _mfn(0); + entry = d->arch.shadow.p2m_freelist.next; + list_del(entry); + list_add_tail(entry, &d->arch.shadow.p2m_inuse); + mfn = page_to_mfn(list_entry(entry, struct page_info, list)); + sh_get_ref(mfn, 0); + p = sh_map_domain_page(mfn); + clear_page(p); + sh_unmap_domain_page(p); + + return mfn; +} + +#if CONFIG_PAGING_LEVELS == 3 +static void p2m_install_entry_in_monitors(struct domain *d, + l3_pgentry_t *l3e) +/* Special case, only used for external-mode domains on PAE hosts: + * update the mapping of the p2m table. Once again, this is trivial in + * other paging modes (one top-level entry points to the top-level p2m, + * no maintenance needed), but PAE makes life difficult by needing a + * copy the eight l3es of the p2m table in eight l2h slots in the + * monitor table. This function makes fresh copies when a p2m l3e + * changes. 
*/ +{ + l2_pgentry_t *ml2e; + struct vcpu *v; + unsigned int index; + + index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t); + ASSERT(index < MACHPHYS_MBYTES>>1); + + for_each_vcpu(d, v) + { + if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) + continue; + ASSERT(shadow_mode_external(v->domain)); + + SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n", + d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e)); + + if ( v == current ) /* OK to use linear map of monitor_table */ + ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START); + else + { + l3_pgentry_t *ml3e; + ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); + ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT); + ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3]))); + ml2e += l2_table_offset(RO_MPT_VIRT_START); + sh_unmap_domain_page(ml3e); + } + ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR); + if ( v != current ) + sh_unmap_domain_page(ml2e); + } +} +#endif + +// Find the next level's P2M entry, checking for out-of-range gfn's... +// Returns NULL on error. +// +static l1_pgentry_t * +p2m_find_entry(void *table, unsigned long *gfn_remainder, + unsigned long gfn, u32 shift, u32 max) +{ + u32 index; + + index = *gfn_remainder >> shift; + if ( index >= max ) + { + SHADOW_DEBUG(P2M, "gfn=0x%lx out of range " + "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n", + gfn, *gfn_remainder, shift, index, max); + return NULL; + } + *gfn_remainder &= (1 << shift) - 1; + return (l1_pgentry_t *)table + index; +} + +// Walk one level of the P2M table, allocating a new table if required. +// Returns 0 on error. 
+// +static int +p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, + unsigned long *gfn_remainder, unsigned long gfn, u32 shift, + u32 max, unsigned long type) +{ + l1_pgentry_t *p2m_entry; + void *next; + + if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, + shift, max)) ) + return 0; + + if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) ) + { + mfn_t mfn = shadow_alloc_p2m_page(d); + if ( mfn_x(mfn) == 0 ) + return 0; + *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER); + mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated; + mfn_to_page(mfn)->count_info = 1; +#if CONFIG_PAGING_LEVELS == 3 + if (type == PGT_l2_page_table) + { + /* We have written to the p2m l3: need to sync the per-vcpu + * copies of it in the monitor tables */ + p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry); + } +#endif + /* The P2M can be shadowed: keep the shadows synced */ + if ( d->vcpu[0] ) + (void)__shadow_validate_guest_entry(d->vcpu[0], *table_mfn, + p2m_entry, sizeof *p2m_entry); + } + *table_mfn = _mfn(l1e_get_pfn(*p2m_entry)); + next = sh_map_domain_page(*table_mfn); + sh_unmap_domain_page(*table); + *table = next; + + return 1; +} + +// Returns 0 on error (out of memory) +int +shadow_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn) +{ + // XXX -- this might be able to be faster iff current->domain == d + mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table); + void *table = sh_map_domain_page(table_mfn); + unsigned long gfn_remainder = gfn; + l1_pgentry_t *p2m_entry; + +#if CONFIG_PAGING_LEVELS >= 4 + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L4_PAGETABLE_SHIFT - PAGE_SHIFT, + L4_PAGETABLE_ENTRIES, PGT_l3_page_table) ) + return 0; +#endif +#if CONFIG_PAGING_LEVELS >= 3 + // When using PAE Xen, we only allow 33 bits of pseudo-physical + // address in translated guests (i.e. 8 GBytes). 
This restriction + // comes from wanting to map the P2M table into the 16MB RO_MPT hole + // in Xen's address space for translated PV guests. + // + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L3_PAGETABLE_SHIFT - PAGE_SHIFT, + (CONFIG_PAGING_LEVELS == 3 + ? 8 + : L3_PAGETABLE_ENTRIES), + PGT_l2_page_table) ) + return 0; +#endif + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L2_PAGETABLE_SHIFT - PAGE_SHIFT, + L2_PAGETABLE_ENTRIES, PGT_l1_page_table) ) + return 0; + + p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, + 0, L1_PAGETABLE_ENTRIES); + ASSERT(p2m_entry); + if ( valid_mfn(mfn) ) + *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER); + else + *p2m_entry = l1e_empty(); + + /* The P2M can be shadowed: keep the shadows synced */ + (void) __shadow_validate_guest_entry(d->vcpu[0], table_mfn, + p2m_entry, sizeof *p2m_entry); + + sh_unmap_domain_page(table); + + return 1; +} + +// Allocate a new p2m table for a domain. +// +// The structure of the p2m table is that of a pagetable for xen (i.e. it is +// controlled by CONFIG_PAGING_LEVELS). 
+// +// Returns 0 if p2m table could not be initialized +// +static int +shadow_alloc_p2m_table(struct domain *d) +{ + mfn_t p2m_top; + struct list_head *entry; + unsigned int page_count = 0; + + SHADOW_PRINTK("allocating p2m table\n"); + ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0); + + p2m_top = shadow_alloc_p2m_page(d); + mfn_to_page(p2m_top)->count_info = 1; + mfn_to_page(p2m_top)->u.inuse.type_info = +#if CONFIG_PAGING_LEVELS == 4 + PGT_l4_page_table +#elif CONFIG_PAGING_LEVELS == 3 + PGT_l3_page_table +#elif CONFIG_PAGING_LEVELS == 2 + PGT_l2_page_table +#endif + | 1 | PGT_validated; + + if ( mfn_x(p2m_top) == 0 ) + return 0; + + d->arch.phys_table = pagetable_from_mfn(p2m_top); + + SHADOW_PRINTK("populating p2m table\n"); + + for ( entry = d->page_list.next; + entry != &d->page_list; + entry = entry->next ) + { + struct page_info *page = list_entry(entry, struct page_info, list); + mfn_t mfn = page_to_mfn(page); + unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn)); + page_count++; + if ( +#ifdef __x86_64__ + (gfn != 0x5555555555555555L) +#else + (gfn != 0x55555555L) +#endif + && gfn != INVALID_M2P_ENTRY + && !shadow_set_p2m_entry(d, gfn, mfn) ) + { + SHADOW_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH_PRI_mfn "\n", + gfn, mfn_x(mfn)); + return 0; + } + } + + SHADOW_PRINTK("p2m table initialised (%u pages)\n", page_count); + return 1; +} + +mfn_t +sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn) +/* Read another domain's p2m entries */ +{ + mfn_t mfn; + unsigned long addr = gpfn << PAGE_SHIFT; + l2_pgentry_t *l2e; + l1_pgentry_t *l1e; + + ASSERT(shadow_mode_translate(d)); + mfn = pagetable_get_mfn(d->arch.phys_table); + + +#if CONFIG_PAGING_LEVELS > 2 + if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) + /* This pfn is higher than the p2m map can hold */ + return _mfn(INVALID_MFN); +#endif + + +#if CONFIG_PAGING_LEVELS >= 4 + { + l4_pgentry_t *l4e = sh_map_domain_page(mfn); + l4e += 
l4_table_offset(addr); + if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 ) + { + sh_unmap_domain_page(l4e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l4e_get_pfn(*l4e)); + sh_unmap_domain_page(l4e); + } +#endif +#if CONFIG_PAGING_LEVELS >= 3 + { + l3_pgentry_t *l3e = sh_map_domain_page(mfn); + l3e += l3_table_offset(addr); + if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 ) + { + sh_unmap_domain_page(l3e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l3e_get_pfn(*l3e)); + sh_unmap_domain_page(l3e); + } +#endif + + l2e = sh_map_domain_page(mfn); + l2e += l2_table_offset(addr); + if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) + { + sh_unmap_domain_page(l2e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l2e_get_pfn(*l2e)); + sh_unmap_domain_page(l2e); + + l1e = sh_map_domain_page(mfn); + l1e += l1_table_offset(addr); + if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) + { + sh_unmap_domain_page(l1e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l1e_get_pfn(*l1e)); + sh_unmap_domain_page(l1e); + + return mfn; +} + +unsigned long +shadow_gfn_to_mfn_foreign(unsigned long gpfn) +{ + return mfn_x(sh_gfn_to_mfn_foreign(current->domain, gpfn)); +} + + +static void shadow_p2m_teardown(struct domain *d) +/* Return all the p2m pages to Xen. + * We know we don't have any extra mappings to these pages */ +{ + struct list_head *entry, *n; + struct page_info *pg; + + d->arch.phys_table = pagetable_null(); + + list_for_each_safe(entry, n, &d->arch.shadow.p2m_inuse) + { + pg = list_entry(entry, struct page_info, list); + list_del(entry); + /* Should have just the one ref we gave it in alloc_p2m_page() */ + if ( (pg->count_info & PGC_SH_count_mask) != 1 ) + { + SHADOW_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n", + pg->count_info, pg->u.inuse.type_info); + } + ASSERT(page_get_owner(pg) == d); + /* Free should not decrement domain's total allocation, since + * these pages were allocated without an owner. 
*/ + page_set_owner(pg, NULL); + free_domheap_pages(pg, 0); + d->arch.shadow.p2m_pages--; + perfc_decr(shadow_alloc_count); + } + list_for_each_safe(entry, n, &d->arch.shadow.p2m_freelist) + { + list_del(entry); + pg = list_entry(entry, struct page_info, list); + ASSERT(page_get_owner(pg) == d); + /* Free should not decrement domain's total allocation. */ + page_set_owner(pg, NULL); + free_domheap_pages(pg, 0); + d->arch.shadow.p2m_pages--; + perfc_decr(shadow_alloc_count); + } + ASSERT(d->arch.shadow.p2m_pages == 0); +} + +/* Set the pool of shadow pages to the required number of pages. + * Input will be rounded up to at least shadow_min_acceptable_pages(), + * plus space for the p2m table. + * Returns 0 for success, non-zero for failure. */ +static unsigned int set_sh_allocation(struct domain *d, + unsigned int pages, + int *preempted) +{ + struct page_info *pg; + unsigned int lower_bound; + int j; + + ASSERT(shadow_lock_is_acquired(d)); + + /* Don't allocate less than the minimum acceptable, plus one page per + * megabyte of RAM (for the p2m table) */ + lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256); + if ( pages > 0 && pages < lower_bound ) + pages = lower_bound; + /* Round up to largest block size */ + pages = (pages + ((1<arch.shadow.total_pages, pages); + + while ( d->arch.shadow.total_pages != pages ) + { + if ( d->arch.shadow.total_pages < pages ) + { + /* Need to allocate more memory from domheap */ + pg = alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0); + if ( pg == NULL ) + { + SHADOW_PRINTK("failed to allocate shadow pages.\n"); + return -ENOMEM; + } + d->arch.shadow.free_pages += 1<arch.shadow.total_pages += 1<list, + &d->arch.shadow.freelists[SHADOW_MAX_ORDER]); + } + else if ( d->arch.shadow.total_pages > pages ) + { + /* Need to return memory to domheap */ + shadow_prealloc(d, SHADOW_MAX_ORDER); + ASSERT(!list_empty(&d->arch.shadow.freelists[SHADOW_MAX_ORDER])); + pg = list_entry(d->arch.shadow.freelists[SHADOW_MAX_ORDER].next, 
+ struct page_info, list); + list_del(&pg->list); + d->arch.shadow.free_pages -= 1<arch.shadow.total_pages -= 1<domain_id, + d->arch.shadow.total_pages, + shadow_get_allocation(d)); + shadow_unlock(d); + return rv; +} + +/**************************************************************************/ +/* Hash table for storing the guest->shadow mappings */ + +/* Hash function that takes a gfn or mfn, plus another byte of type info */ +typedef u32 key_t; +static inline key_t sh_hash(unsigned long n, u8 t) +{ + unsigned char *p = (unsigned char *)&n; + key_t k = t; + int i; + for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k; + return k; +} + +#if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL) + +/* Before we get to the mechanism, define a pair of audit functions + * that sanity-check the contents of the hash table. */ +static void sh_hash_audit_bucket(struct domain *d, int bucket) +/* Audit one bucket of the hash table */ +{ + struct shadow_hash_entry *e, *x; + struct page_info *pg; + + if ( !(SHADOW_AUDIT_ENABLE) ) + return; + + e = &d->arch.shadow.hash_table[bucket]; + if ( e->t == 0 ) return; /* Bucket is empty */ + while ( e ) + { + /* Empty link? */ + BUG_ON( e->t == 0 ); + /* Bogus type? */ + BUG_ON( e->t > (PGC_SH_max_shadow >> PGC_SH_type_shift) ); + /* Wrong bucket? */ + BUG_ON( sh_hash(e->n, e->t) % SHADOW_HASH_BUCKETS != bucket ); + /* Duplicate entry? */ + for ( x = e->next; x; x = x->next ) + BUG_ON( x->n == e->n && x->t == e->t ); + /* Bogus MFN? */ + BUG_ON( !valid_mfn(e->smfn) ); + pg = mfn_to_page(e->smfn); + /* Not a shadow? */ + BUG_ON( page_get_owner(pg) != 0 ); + /* Wrong kind of shadow? */ + BUG_ON( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift + != e->t ); + /* Bad backlink? 
*/ + BUG_ON( pg->u.inuse.type_info != e->n ); + if ( e->t != (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift) + && e->t != (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift) + && e->t != (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift) ) + { + /* Bad shadow flags on guest page? */ + BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow_flags & (1<t)) ); + } + /* That entry was OK; on we go */ + e = e->next; + } +} + +#else +#define sh_hash_audit_bucket(_d, _b) +#endif /* Hashtable bucket audit */ + + +#if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL + +static void sh_hash_audit(struct domain *d) +/* Full audit: audit every bucket in the table */ +{ + int i; + + if ( !(SHADOW_AUDIT_ENABLE) ) + return; + + for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) + { + sh_hash_audit_bucket(d, i); + } +} + +#else +#define sh_hash_audit(_d) +#endif /* Hashtable bucket audit */ + +/* Memory management interface for bucket allocation. + * These ought to come out of shadow memory, but at least on 32-bit + * machines we are forced to allocate them from xenheap so that we can + * address them. */ +static struct shadow_hash_entry *sh_alloc_hash_entry(struct domain *d) +{ + struct shadow_hash_entry *extra, *x; + int i; + + /* We need to allocate a new node. Ensure the free list is not empty. + * Allocate new entries in units the same size as the original table. */ + if ( unlikely(d->arch.shadow.hash_freelist == NULL) ) + { + size_t sz = sizeof(void *) + (SHADOW_HASH_BUCKETS * sizeof(*x)); + extra = xmalloc_bytes(sz); + + if ( extra == NULL ) + { + /* No memory left! */ + SHADOW_ERROR("xmalloc() failed when allocating hash buckets.\n"); + domain_crash_synchronous(); + } + memset(extra, 0, sz); + + /* Record the allocation block so it can be correctly freed later. */ + *((struct shadow_hash_entry **)&extra[SHADOW_HASH_BUCKETS]) = + d->arch.shadow.hash_allocations; + d->arch.shadow.hash_allocations = &extra[0]; + + /* Thread a free chain through the newly-allocated nodes. 
*/ + for ( i = 0; i < (SHADOW_HASH_BUCKETS - 1); i++ ) + extra[i].next = &extra[i+1]; + extra[i].next = NULL; + + /* Add the new nodes to the free list. */ + d->arch.shadow.hash_freelist = &extra[0]; + } + + /* Allocate a new node from the free list. */ + x = d->arch.shadow.hash_freelist; + d->arch.shadow.hash_freelist = x->next; + return x; +} + +static void sh_free_hash_entry(struct domain *d, struct shadow_hash_entry *e) +{ + /* Mark the bucket as empty and return it to the free list */ + e->t = 0; + e->next = d->arch.shadow.hash_freelist; + d->arch.shadow.hash_freelist = e; +} + + +/* Allocate and initialise the table itself. + * Returns 0 for success, 1 for error. */ +static int shadow_hash_alloc(struct domain *d) +{ + struct shadow_hash_entry *table; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(!d->arch.shadow.hash_table); + + table = xmalloc_array(struct shadow_hash_entry, SHADOW_HASH_BUCKETS); + if ( !table ) return 1; + memset(table, 0, + SHADOW_HASH_BUCKETS * sizeof (struct shadow_hash_entry)); + d->arch.shadow.hash_table = table; + return 0; +} + +/* Tear down the hash table and return all memory to Xen. + * This function does not care whether the table is populated. */ +static void shadow_hash_teardown(struct domain *d) +{ + struct shadow_hash_entry *a, *n; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(d->arch.shadow.hash_table); + + /* Return the table itself */ + xfree(d->arch.shadow.hash_table); + d->arch.shadow.hash_table = NULL; + + /* Return any extra allocations */ + a = d->arch.shadow.hash_allocations; + while ( a ) + { + /* We stored a linked-list pointer at the end of each allocation */ + n = *((struct shadow_hash_entry **)(&a[SHADOW_HASH_BUCKETS])); + xfree(a); + a = n; + } + d->arch.shadow.hash_allocations = NULL; + d->arch.shadow.hash_freelist = NULL; +} + + +mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, u8 t) +/* Find an entry in the hash table. 
Returns the MFN of the shadow, + * or INVALID_MFN if it doesn't exist */ +{ + struct domain *d = v->domain; + struct shadow_hash_entry *p, *x, *head; + key_t key; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(d->arch.shadow.hash_table); + ASSERT(t); + + sh_hash_audit(d); + + perfc_incrc(shadow_hash_lookups); + key = sh_hash(n, t); + + x = head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS]; + p = NULL; + + sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS); + + do + { + ASSERT(x->t || ((x == head) && (x->next == NULL))); + + if ( x->n == n && x->t == t ) + { + /* Pull-to-front if 'x' isn't already the head item */ + if ( unlikely(x != head) ) + { + if ( unlikely(d->arch.shadow.hash_walking != 0) ) + /* Can't reorder: someone is walking the hash chains */ + return x->smfn; + else + { + /* Delete 'x' from list and reinsert after head. */ + p->next = x->next; + x->next = head->next; + head->next = x; + + /* Swap 'x' contents with head contents. */ + SWAP(head->n, x->n); + SWAP(head->t, x->t); + SWAP(head->smfn, x->smfn); + } + } + else + { + perfc_incrc(shadow_hash_lookup_head); + } + return head->smfn; + } + + p = x; + x = x->next; + } + while ( x != NULL ); + + perfc_incrc(shadow_hash_lookup_miss); + return _mfn(INVALID_MFN); +} + +void shadow_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn) +/* Put a mapping (n,t)->smfn into the hash table */ +{ + struct domain *d = v->domain; + struct shadow_hash_entry *x, *head; + key_t key; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(d->arch.shadow.hash_table); + ASSERT(t); + + sh_hash_audit(d); + + perfc_incrc(shadow_hash_inserts); + key = sh_hash(n, t); + + head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS]; + + sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS); + + /* If the bucket is empty then insert the new page as the head item. 
*/ + if ( head->t == 0 ) + { + head->n = n; + head->t = t; + head->smfn = smfn; + ASSERT(head->next == NULL); + } + else + { + /* Insert a new entry directly after the head item. */ + x = sh_alloc_hash_entry(d); + x->n = n; + x->t = t; + x->smfn = smfn; + x->next = head->next; + head->next = x; + } + + sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS); +} + +void shadow_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn) +/* Excise the mapping (n,t)->smfn from the hash table */ +{ + struct domain *d = v->domain; + struct shadow_hash_entry *p, *x, *head; + key_t key; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(d->arch.shadow.hash_table); + ASSERT(t); + + sh_hash_audit(d); + + perfc_incrc(shadow_hash_deletes); + key = sh_hash(n, t); + + head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS]; + + sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS); + + /* Match on head item? */ + if ( head->n == n && head->t == t ) + { + if ( (x = head->next) != NULL ) + { + /* Overwrite head with contents of following node. */ + head->n = x->n; + head->t = x->t; + head->smfn = x->smfn; + + /* Delete following node. */ + head->next = x->next; + sh_free_hash_entry(d, x); + } + else + { + /* This bucket is now empty. Initialise the head node. */ + head->t = 0; + } + } + else + { + /* Not at the head; need to walk the chain */ + p = head; + x = head->next; + + while(1) + { + ASSERT(x); /* We can't have hit the end, since our target is + * still in the chain somehwere... */ + if ( x->n == n && x->t == t ) + { + /* Delete matching node. 
*/ + p->next = x->next; + sh_free_hash_entry(d, x); + break; + } + p = x; + x = x->next; + } + } + + sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS); +} + +typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn); + +static void hash_foreach(struct vcpu *v, + unsigned int callback_mask, + hash_callback_t callbacks[], + mfn_t callback_mfn) +/* Walk the hash table looking at the types of the entries and + * calling the appropriate callback function for each entry. + * The mask determines which shadow types we call back for, and the array + * of callbacks tells us which function to call. + * Any callback may return non-zero to let us skip the rest of the scan. + * + * WARNING: Callbacks MUST NOT add or remove hash entries unless they + * then return non-zero to terminate the scan. */ +{ + int i, done = 0; + struct domain *d = v->domain; + struct shadow_hash_entry *x; + + /* Say we're here, to stop hash-lookups reordering the chains */ + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(d->arch.shadow.hash_walking == 0); + d->arch.shadow.hash_walking = 1; + + callback_mask &= ~1; /* Never attempt to call back on empty buckets */ + for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) + { + /* WARNING: This is not safe against changes to the hash table. + * The callback *must* return non-zero if it has inserted or + * deleted anything from the hash (lookups are OK, though). */ + for ( x = &d->arch.shadow.hash_table[i]; x; x = x->next ) + { + if ( callback_mask & (1 << x->t) ) + { + ASSERT(x->t <= 15); + ASSERT(callbacks[x->t] != NULL); + if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 ) + break; + } + } + if ( done ) break; + } + d->arch.shadow.hash_walking = 0; +} + + +/**************************************************************************/ +/* Destroy a shadow page: simple dispatcher to call the per-type destructor + * which will decrement refcounts appropriately and return memory to the + * free pool. 
*/ + +void sh_destroy_shadow(struct vcpu *v, mfn_t smfn) +{ + struct page_info *pg = mfn_to_page(smfn); + u32 t = pg->count_info & PGC_SH_type_mask; + + + SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn)); + + /* Double-check, if we can, that the shadowed page belongs to this + * domain, (by following the back-pointer). */ + ASSERT(t == PGC_SH_fl1_32_shadow || + t == PGC_SH_fl1_pae_shadow || + t == PGC_SH_fl1_64_shadow || + t == PGC_SH_monitor_table || + (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info))) + == v->domain)); + + /* The down-shifts here are so that the switch statement is on nice + * small numbers that the compiler will enjoy */ + switch ( t >> PGC_SH_type_shift ) + { +#if CONFIG_PAGING_LEVELS == 2 + case PGC_SH_l1_32_shadow >> PGC_SH_type_shift: + case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn); + break; + case PGC_SH_l2_32_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn); + break; +#else /* PAE or 64bit */ + case PGC_SH_l1_32_shadow >> PGC_SH_type_shift: + case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn); + break; + case PGC_SH_l2_32_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn); + break; +#endif + +#if CONFIG_PAGING_LEVELS >= 3 + case PGC_SH_l1_pae_shadow >> PGC_SH_type_shift: + case PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn); + break; + case PGC_SH_l2_pae_shadow >> PGC_SH_type_shift: + case PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn); + break; + case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 3, 3)(v, smfn); + break; +#endif + +#if CONFIG_PAGING_LEVELS >= 4 + case PGC_SH_l1_64_shadow >> PGC_SH_type_shift: + case PGC_SH_fl1_64_shadow >> PGC_SH_type_shift: + 
SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn); + break; + case PGC_SH_l2_64_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn); + break; + case PGC_SH_l3_64_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn); + break; + case PGC_SH_l4_64_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn); + break; +#endif + default: + SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n", + (unsigned long)t); + BUG(); + } +} + +/**************************************************************************/ +/* Remove all writeable mappings of a guest frame from the shadow tables + * Returns non-zero if we need to flush TLBs. + * level and fault_addr desribe how we found this to be a pagetable; + * level==0 means we have some other reason for revoking write access.*/ + +int shadow_remove_write_access(struct vcpu *v, mfn_t gmfn, + unsigned int level, + unsigned long fault_addr) +{ + /* Dispatch table for getting per-type functions */ + static hash_callback_t callbacks[16] = { + NULL, /* none */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* l1_32 */ + SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* fl1_32 */ +#else + SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* l1_32 */ + SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* fl1_32 */ +#endif + NULL, /* l2_32 */ +#if CONFIG_PAGING_LEVELS >= 3 + SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* l1_pae */ + SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* fl1_pae */ +#else + NULL, /* l1_pae */ + NULL, /* fl1_pae */ +#endif + NULL, /* l2_pae */ + NULL, /* l2h_pae */ + NULL, /* l3_pae */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* l1_64 */ + SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* fl1_64 */ +#else + NULL, /* l1_64 */ + NULL, /* fl1_64 */ +#endif + NULL, /* l2_64 */ + NULL, /* l3_64 */ + NULL, /* 
l4_64 */ + NULL, /* p2m */ + NULL /* unused */ + }; + + static unsigned int callback_mask = + 1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift) + ; + struct page_info *pg = mfn_to_page(gmfn); + + ASSERT(shadow_lock_is_acquired(v->domain)); + + /* Only remove writable mappings if we are doing shadow refcounts. + * In guest refcounting, we trust Xen to already be restricting + * all the writes to the guest page tables, so we do not need to + * do more. */ + if ( !shadow_mode_refcounts(v->domain) ) + return 0; + + /* Early exit if it's already a pagetable, or otherwise not writeable */ + if ( sh_mfn_is_a_page_table(gmfn) + || (pg->u.inuse.type_info & PGT_count_mask) == 0 ) + return 0; + + perfc_incrc(shadow_writeable); + + /* If this isn't a "normal" writeable page, the domain is trying to + * put pagetables in special memory of some kind. We can't allow that. 
*/ + if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page ) + { + SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %" + PRtype_info "\n", + mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info); + domain_crash(v->domain); + } + +#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC + if ( v == current && level != 0 ) + { + unsigned long gfn; + /* Heuristic: there is likely to be only one writeable mapping, + * and that mapping is likely to be in the current pagetable, + * either in the guest's linear map (linux, windows) or in a + * magic slot used to map high memory regions (linux HIGHTPTE) */ + +#define GUESS(_a, _h) do { \ + if ( v->arch.shadow.mode->guess_wrmap(v, (_a), gmfn) ) \ + perfc_incrc(shadow_writeable_h_ ## _h); \ + if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \ + return 1; \ + } while (0) + + + /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */ + if ( v == current + && (gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 ) + GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4); + + if ( v->arch.shadow.mode->guest_levels == 2 ) + { + if ( level == 1 ) + /* 32bit non-PAE w2k3: linear map at 0xC0000000 */ + GUESS(0xC0000000UL + (fault_addr >> 10), 1); + } +#if CONFIG_PAGING_LEVELS >= 3 + else if ( v->arch.shadow.mode->guest_levels == 3 ) + { + /* 32bit PAE w2k3: linear map at 0xC0000000 */ + switch ( level ) + { + case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break; + case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break; + } + } +#if CONFIG_PAGING_LEVELS >= 4 + else if ( v->arch.shadow.mode->guest_levels == 4 ) + { + /* 64bit w2k3: linear map at 0x0000070000000000 */ + switch ( level ) + { + case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break; + case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break; + case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break; + } + } +#endif /* CONFIG_PAGING_LEVELS >= 4 */ +#endif /* CONFIG_PAGING_LEVELS >= 3 */ + +#undef GUESS + + } +#endif + + /* 
Brute-force search of all the shadows, by walking the hash */ + perfc_incrc(shadow_writeable_bf); + hash_foreach(v, callback_mask, callbacks, gmfn); + + /* If that didn't catch the mapping, something is very wrong */ + if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 ) + { + SHADOW_ERROR("can't find all writeable mappings of mfn %lx: " + "%lu left\n", mfn_x(gmfn), + (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask)); + domain_crash(v->domain); + } + + /* We killed at least one writeable mapping, so must flush TLBs. */ + return 1; +} + + + +/**************************************************************************/ +/* Remove all mappings of a guest frame from the shadow tables. + * Returns non-zero if we need to flush TLBs. */ + +int shadow_remove_all_mappings(struct vcpu *v, mfn_t gmfn) +{ + struct page_info *page = mfn_to_page(gmfn); + int expected_count; + + /* Dispatch table for getting per-type functions */ + static hash_callback_t callbacks[16] = { + NULL, /* none */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* l1_32 */ + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* fl1_32 */ +#else + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* l1_32 */ + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* fl1_32 */ +#endif + NULL, /* l2_32 */ +#if CONFIG_PAGING_LEVELS >= 3 + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* l1_pae */ + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* fl1_pae */ +#else + NULL, /* l1_pae */ + NULL, /* fl1_pae */ +#endif + NULL, /* l2_pae */ + NULL, /* l2h_pae */ + NULL, /* l3_pae */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* l1_64 */ + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* fl1_64 */ +#else + NULL, /* l1_64 */ + NULL, /* fl1_64 */ +#endif + NULL, /* l2_64 */ + NULL, /* l3_64 */ + NULL, /* l4_64 */ + NULL, /* p2m */ + NULL /* unused */ + }; + + static unsigned int callback_mask = + 1 << 
(PGC_SH_l1_32_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift) + ; + + perfc_incrc(shadow_mappings); + if ( (page->count_info & PGC_count_mask) == 0 ) + return 0; + + ASSERT(shadow_lock_is_acquired(v->domain)); + + /* XXX TODO: + * Heuristics for finding the (probably) single mapping of this gmfn */ + + /* Brute-force search of all the shadows, by walking the hash */ + perfc_incrc(shadow_mappings_bf); + hash_foreach(v, callback_mask, callbacks, gmfn); + + /* If that didn't catch the mapping, something is very wrong */ + expected_count = (page->count_info & PGC_allocated) ? 1 : 0; + if ( (page->count_info & PGC_count_mask) != expected_count ) + { + /* Don't complain if we're in HVM and there's one extra mapping: + * The qemu helper process has an untyped mapping of this dom's RAM */ + if ( !(shadow_mode_external(v->domain) + && (page->count_info & PGC_count_mask) <= 2 + && (page->u.inuse.type_info & PGT_count_mask) == 0) ) + { + SHADOW_ERROR("can't find all mappings of mfn %lx: " + "c=%08x t=%08lx\n", mfn_x(gmfn), + page->count_info, page->u.inuse.type_info); + } + } + + /* We killed at least one mapping, so must flush TLBs. */ + return 1; +} + + +/**************************************************************************/ +/* Remove all shadows of a guest frame from the shadow tables */ + +static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn) +/* Follow this shadow's up-pointer, if it has one, and remove the reference + * found there. 
Returns 1 if that was the only reference to this shadow */ +{ + struct page_info *pg = mfn_to_page(smfn); + mfn_t pmfn; + void *vaddr; + int rc; + + ASSERT((pg->count_info & PGC_SH_type_mask) > 0); + ASSERT((pg->count_info & PGC_SH_type_mask) < PGC_SH_max_shadow); + ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l2_32_shadow); + ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l3_pae_shadow); + ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l4_64_shadow); + + if (pg->up == 0) return 0; + pmfn = _mfn(pg->up >> PAGE_SHIFT); + ASSERT(valid_mfn(pmfn)); + vaddr = sh_map_domain_page(pmfn); + ASSERT(vaddr); + vaddr += pg->up & (PAGE_SIZE-1); + ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn)); + + /* Is this the only reference to this shadow? */ + rc = ((pg->count_info & PGC_SH_count_mask) == 1) ? 1 : 0; + + /* Blank the offending entry */ + switch ((pg->count_info & PGC_SH_type_mask)) + { + case PGC_SH_l1_32_shadow: + case PGC_SH_l2_32_shadow: +#if CONFIG_PAGING_LEVELS == 2 + SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn); +#else + SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn); +#endif + break; +#if CONFIG_PAGING_LEVELS >=3 + case PGC_SH_l1_pae_shadow: + case PGC_SH_l2_pae_shadow: + case PGC_SH_l2h_pae_shadow: + case PGC_SH_l3_pae_shadow: + SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn); + break; +#if CONFIG_PAGING_LEVELS >= 4 + case PGC_SH_l1_64_shadow: + case PGC_SH_l2_64_shadow: + case PGC_SH_l3_64_shadow: + case PGC_SH_l4_64_shadow: + SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn); + break; +#endif +#endif + default: BUG(); /* Some wierd unknown shadow type */ + } + + sh_unmap_domain_page(vaddr); + if ( rc ) + perfc_incrc(shadow_up_pointer); + else + perfc_incrc(shadow_unshadow_bf); + + return rc; +} + +void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int all) +/* Remove the shadows of this guest page. + * If all != 0, find all shadows, if necessary by walking the tables. 
+ * Otherwise, just try the (much faster) heuristics, which will remove + * at most one reference to each shadow of the page. */ +{ + struct page_info *pg; + mfn_t smfn; + u32 sh_flags; + unsigned char t; + + /* Dispatch table for getting per-type functions: each level must + * be called with the function to remove a lower-level shadow. */ + static hash_callback_t callbacks[16] = { + NULL, /* none */ + NULL, /* l1_32 */ + NULL, /* fl1_32 */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32 */ +#else + SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32 */ +#endif + NULL, /* l1_pae */ + NULL, /* fl1_pae */ +#if CONFIG_PAGING_LEVELS >= 3 + SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae */ + SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */ + SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,3,3), /* l3_pae */ +#else + NULL, /* l2_pae */ + NULL, /* l2h_pae */ + NULL, /* l3_pae */ +#endif + NULL, /* l1_64 */ + NULL, /* fl1_64 */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64 */ + SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64 */ + SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64 */ +#else + NULL, /* l2_64 */ + NULL, /* l3_64 */ + NULL, /* l4_64 */ +#endif + NULL, /* p2m */ + NULL /* unused */ + }; + + /* Another lookup table, for choosing which mask to use */ + static unsigned int masks[16] = { + 0, /* none */ + 1 << (PGC_SH_l2_32_shadow >> PGC_SH_type_shift), /* l1_32 */ + 0, /* fl1_32 */ + 0, /* l2_32 */ + ((1 << (PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift)) + | (1 << (PGC_SH_l2_pae_shadow >> PGC_SH_type_shift))), /* l1_pae */ + 0, /* fl1_pae */ + 1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2_pae */ + 1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2h_pae */ + 0, /* l3_pae */ + 1 << (PGC_SH_l2_64_shadow >> PGC_SH_type_shift), /* l1_64 */ + 0, /* fl1_64 */ + 1 << (PGC_SH_l3_64_shadow >> PGC_SH_type_shift), /* l2_64 */ + 1 << 
(PGC_SH_l4_64_shadow >> PGC_SH_type_shift), /* l3_64 */ + 0, /* l4_64 */ + 0, /* p2m */ + 0 /* unused */ + }; + + ASSERT(shadow_lock_is_acquired(v->domain)); + + pg = mfn_to_page(gmfn); + + /* Bale out now if the page is not shadowed */ + if ( (pg->count_info & PGC_page_table) == 0 ) + return; + + SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n", + v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); + + /* Search for this shadow in all appropriate shadows */ + perfc_incrc(shadow_unshadow); + sh_flags = pg->shadow_flags; + + /* Lower-level shadows need to be excised from upper-level shadows. + * This call to hash_foreach() looks dangerous but is in fact OK: each + * call will remove at most one shadow, and terminate immediately when + * it does remove it, so we never walk the hash after doing a deletion. */ +#define DO_UNSHADOW(_type) do { \ + t = (_type) >> PGC_SH_type_shift; \ + smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \ + if ( !sh_remove_shadow_via_pointer(v, smfn) && all ) \ + hash_foreach(v, masks[t], callbacks, smfn); \ +} while (0) + + /* Top-level shadows need to be unpinned */ +#define DO_UNPIN(_type) do { \ + t = (_type) >> PGC_SH_type_shift; \ + smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \ + if ( mfn_to_page(smfn)->count_info & PGC_SH_pinned ) \ + sh_unpin(v, smfn); \ + if ( (_type) == PGC_SH_l3_pae_shadow ) \ + SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn); \ +} while (0) + + if ( sh_flags & SHF_L1_32 ) DO_UNSHADOW(PGC_SH_l1_32_shadow); + if ( sh_flags & SHF_L2_32 ) DO_UNPIN(PGC_SH_l2_32_shadow); +#if CONFIG_PAGING_LEVELS >= 3 + if ( sh_flags & SHF_L1_PAE ) DO_UNSHADOW(PGC_SH_l1_pae_shadow); + if ( sh_flags & SHF_L2_PAE ) DO_UNSHADOW(PGC_SH_l2_pae_shadow); + if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(PGC_SH_l2h_pae_shadow); + if ( sh_flags & SHF_L3_PAE ) DO_UNPIN(PGC_SH_l3_pae_shadow); +#if CONFIG_PAGING_LEVELS >= 4 + if ( sh_flags & SHF_L1_64 ) DO_UNSHADOW(PGC_SH_l1_64_shadow); + if ( sh_flags & SHF_L2_64 ) DO_UNSHADOW(PGC_SH_l2_64_shadow); + 
if ( sh_flags & SHF_L3_64 ) DO_UNSHADOW(PGC_SH_l3_64_shadow); + if ( sh_flags & SHF_L4_64 ) DO_UNPIN(PGC_SH_l4_64_shadow); +#endif +#endif + +#undef DO_UNSHADOW +#undef DO_UNPIN + + +#if CONFIG_PAGING_LEVELS > 2 + /* We may have caused some PAE l3 entries to change: need to + * fix up the copies of them in various places */ + if ( sh_flags & (SHF_L2_PAE|SHF_L2H_PAE) ) + sh_pae_recopy(v->domain); +#endif + + /* If that didn't catch the shadows, something is wrong */ + if ( all && (pg->count_info & PGC_page_table) ) + { + SHADOW_ERROR("can't find all shadows of mfn %05lx (shadow_flags=%08x)\n", + mfn_x(gmfn), pg->shadow_flags); + domain_crash(v->domain); + } +} + +void +shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn) +/* Even harsher: this is a HVM page that we thing is no longer a pagetable. + * Unshadow it, and recursively unshadow pages that reference it. */ +{ + shadow_remove_all_shadows(v, gmfn); + /* XXX TODO: + * Rework this hashtable walker to return a linked-list of all + * the shadows it modified, then do breadth-first recursion + * to find the way up to higher-level tables and unshadow them too. + * + * The current code (just tearing down each page's shadows as we + * detect that it is not a pagetable) is correct, but very slow. + * It means extra emulated writes and slows down removal of mappings. 
*/ +} + +/**************************************************************************/ + +void sh_update_paging_modes(struct vcpu *v) +{ + struct domain *d = v->domain; + struct shadow_paging_mode *old_mode = v->arch.shadow.mode; + mfn_t old_guest_table; + + ASSERT(shadow_lock_is_acquired(d)); + + // Valid transitions handled by this function: + // - For PV guests: + // - after a shadow mode has been changed + // - For HVM guests: + // - after a shadow mode has been changed + // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE + // + + // Avoid determining the current shadow mode for uninitialized CPUs, as + // we can not yet determine whether it is an HVM or PV domain. + // + if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) ) + { + printk("%s: postponing determination of shadow mode\n", __func__); + return; + } + + // First, tear down any old shadow tables held by this vcpu. + // + shadow_detach_old_tables(v); + + if ( !hvm_guest(v) ) + { + /// + /// PV guest + /// +#if CONFIG_PAGING_LEVELS == 4 + if ( pv_32bit_guest(v) ) + v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,3); + else + v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4); +#elif CONFIG_PAGING_LEVELS == 3 + v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3); +#elif CONFIG_PAGING_LEVELS == 2 + v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2); +#else +#error unexpected paging mode +#endif + } + else + { + /// + /// HVM guest + /// + ASSERT(shadow_mode_translate(d)); + ASSERT(shadow_mode_external(d)); + + v->arch.shadow.hvm_paging_enabled = !!hvm_paging_enabled(v); + if ( !v->arch.shadow.hvm_paging_enabled ) + { + + /* Set v->arch.guest_table to use the p2m map, and choose + * the appropriate shadow mode */ + old_guest_table = pagetable_get_mfn(v->arch.guest_table); +#if CONFIG_PAGING_LEVELS == 2 + v->arch.guest_table = + pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table)); + v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2); 
+#elif CONFIG_PAGING_LEVELS == 3 + v->arch.guest_table = + pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table)); + v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3); +#else /* CONFIG_PAGING_LEVELS == 4 */ + { + l4_pgentry_t *l4e; + /* Use the start of the first l3 table as a PAE l3 */ + ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0); + l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); + ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT); + v->arch.guest_table = + pagetable_from_pfn(l4e_get_pfn(l4e[0])); + sh_unmap_domain_page(l4e); + } + v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3); +#endif + /* Fix up refcounts on guest_table */ + get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d); + if ( mfn_x(old_guest_table) != 0 ) + put_page(mfn_to_page(old_guest_table)); + } + else + { +#ifdef __x86_64__ + if ( hvm_long_mode_enabled(v) ) + { + // long mode guest... + v->arch.shadow.mode = + &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4); + } + else +#endif + if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE ) + { +#if CONFIG_PAGING_LEVELS >= 3 + // 32-bit PAE mode guest... + v->arch.shadow.mode = + &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3); +#else + SHADOW_ERROR("PAE not supported in 32-bit Xen\n"); + domain_crash(d); + return; +#endif + } + else + { + // 32-bit 2 level guest... 
+#if CONFIG_PAGING_LEVELS >= 3 + v->arch.shadow.mode = + &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2); +#else + v->arch.shadow.mode = + &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2); +#endif + } + } + + if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) + { + mfn_t mmfn = shadow_make_monitor_table(v); + v->arch.monitor_table = pagetable_from_mfn(mmfn); + v->arch.monitor_vtable = sh_map_domain_page(mmfn); + } + + if ( v->arch.shadow.mode != old_mode ) + { + SHADOW_PRINTK("new paging mode: d=%u v=%u g=%u s=%u " + "(was g=%u s=%u)\n", + d->domain_id, v->vcpu_id, + v->arch.shadow.mode->guest_levels, + v->arch.shadow.mode->shadow_levels, + old_mode ? old_mode->guest_levels : 0, + old_mode ? old_mode->shadow_levels : 0); + if ( old_mode && + (v->arch.shadow.mode->shadow_levels != + old_mode->shadow_levels) ) + { + /* Need to make a new monitor table for the new mode */ + mfn_t new_mfn, old_mfn; + + if ( v != current ) + { + SHADOW_ERROR("Some third party (d=%u v=%u) is changing " + "this HVM vcpu's (d=%u v=%u) paging mode!\n", + current->domain->domain_id, current->vcpu_id, + v->domain->domain_id, v->vcpu_id); + domain_crash(v->domain); + return; + } + + sh_unmap_domain_page(v->arch.monitor_vtable); + old_mfn = pagetable_get_mfn(v->arch.monitor_table); + v->arch.monitor_table = pagetable_null(); + new_mfn = v->arch.shadow.mode->make_monitor_table(v); + v->arch.monitor_table = pagetable_from_mfn(new_mfn); + v->arch.monitor_vtable = sh_map_domain_page(new_mfn); + SHADOW_PRINTK("new monitor table %"SH_PRI_mfn "\n", + mfn_x(new_mfn)); + + /* Don't be running on the old monitor table when we + * pull it down! Switch CR3, and warn the HVM code that + * its host cr3 has changed. */ + make_cr3(v, mfn_x(new_mfn)); + write_ptbase(v); + hvm_update_host_cr3(v); + old_mode->destroy_monitor_table(v, old_mfn); + } + } + + // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE. 
+ // These are HARD: think about the case where two CPU's have + // different values for CR4.PSE and CR4.PGE at the same time. + // This *does* happen, at least for CR4.PGE... + } + + v->arch.shadow.mode->update_cr3(v); +} + +/**************************************************************************/ +/* Turning on and off shadow features */ + +static void sh_new_mode(struct domain *d, u32 new_mode) +/* Inform all the vcpus that the shadow mode has been changed */ +{ + struct vcpu *v; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(d != current->domain); + d->arch.shadow.mode = new_mode; + if ( new_mode & SHM2_translate ) + shadow_audit_p2m(d); + for_each_vcpu(d, v) + sh_update_paging_modes(v); +} + +static int shadow_enable(struct domain *d, u32 mode) +/* Turn on "permanent" shadow features: external, translate, refcount. + * Can only be called once on a domain, and these features cannot be + * disabled. + * Returns 0 for success, -errno for failure. */ +{ + unsigned int old_pages; + int rv = 0; + + mode |= SHM2_enable; + + domain_pause(d); + shadow_lock(d); + + /* Sanity check the arguments */ + if ( (d == current->domain) || + shadow_mode_enabled(d) || + ((mode & SHM2_external) && !(mode & SHM2_translate)) ) + { + rv = -EINVAL; + goto out; + } + + // XXX -- eventually would like to require that all memory be allocated + // *after* shadow_enabled() is called... So here, we would test to make + // sure that d->page_list is empty. 
+#if 0 + spin_lock(&d->page_alloc_lock); + if ( !list_empty(&d->page_list) ) + { + spin_unlock(&d->page_alloc_lock); + rv = -EINVAL; + goto out; + } + spin_unlock(&d->page_alloc_lock); +#endif + + /* Init the shadow memory allocation if the user hasn't done so */ + old_pages = d->arch.shadow.total_pages; + if ( old_pages == 0 ) + if ( set_sh_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */ + { + set_sh_allocation(d, 0, NULL); + rv = -ENOMEM; + goto out; + } + + /* Init the hash table */ + if ( shadow_hash_alloc(d) != 0 ) + { + set_sh_allocation(d, old_pages, NULL); + rv = -ENOMEM; + goto out; + } + + /* Init the P2M table */ + if ( mode & SHM2_translate ) + if ( !shadow_alloc_p2m_table(d) ) + { + shadow_hash_teardown(d); + set_sh_allocation(d, old_pages, NULL); + shadow_p2m_teardown(d); + rv = -ENOMEM; + goto out; + } + + /* Update the bits */ + sh_new_mode(d, mode); + shadow_audit_p2m(d); + out: + shadow_unlock(d); + domain_unpause(d); + return 0; +} + +void shadow_teardown(struct domain *d) +/* Destroy the shadow pagetables of this domain and free its shadow memory. + * Should only be called for dying domains. */ +{ + struct vcpu *v; + mfn_t mfn; + + ASSERT(test_bit(_DOMF_dying, &d->domain_flags)); + ASSERT(d != current->domain); + + if ( !shadow_lock_is_acquired(d) ) + shadow_lock(d); /* Keep various asserts happy */ + + if ( shadow_mode_enabled(d) ) + { + /* Release the shadow and monitor tables held by each vcpu */ + for_each_vcpu(d, v) + { + shadow_detach_old_tables(v); + if ( shadow_mode_external(d) ) + { + mfn = pagetable_get_mfn(v->arch.monitor_table); + if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) ) + shadow_destroy_monitor_table(v, mfn); + v->arch.monitor_table = pagetable_null(); + } + } + } + + if ( d->arch.shadow.total_pages != 0 ) + { + SHADOW_PRINTK("teardown of domain %u starts." 
+ " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow.total_pages, + d->arch.shadow.free_pages, + d->arch.shadow.p2m_pages); + /* Destroy all the shadows and release memory to domheap */ + set_sh_allocation(d, 0, NULL); + /* Release the hash table back to xenheap */ + if (d->arch.shadow.hash_table) + shadow_hash_teardown(d); + /* Release the log-dirty bitmap of dirtied pages */ + sh_free_log_dirty_bitmap(d); + /* Should not have any more memory held */ + SHADOW_PRINTK("teardown done." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->arch.shadow.total_pages, + d->arch.shadow.free_pages, + d->arch.shadow.p2m_pages); + ASSERT(d->arch.shadow.total_pages == 0); + } + + /* We leave the "permanent" shadow modes enabled, but clear the + * log-dirty mode bit. We don't want any more mark_dirty() + * calls now that we've torn down the bitmap */ + d->arch.shadow.mode &= ~SHM2_log_dirty; + + shadow_unlock(d); +} + +void shadow_final_teardown(struct domain *d) +/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */ +{ + + SHADOW_PRINTK("dom %u final teardown starts." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow.total_pages, + d->arch.shadow.free_pages, + d->arch.shadow.p2m_pages); + + /* Double-check that the domain didn't have any shadow memory. + * It is possible for a domain that never got domain_kill()ed + * to get here with its shadow allocation intact. */ + if ( d->arch.shadow.total_pages != 0 ) + shadow_teardown(d); + + /* It is now safe to pull down the p2m map. */ + if ( d->arch.shadow.p2m_pages != 0 ) + shadow_p2m_teardown(d); + + SHADOW_PRINTK("dom %u final teardown done." 
+ " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow.total_pages, + d->arch.shadow.free_pages, + d->arch.shadow.p2m_pages); +} + +static int shadow_one_bit_enable(struct domain *d, u32 mode) +/* Turn on a single shadow mode feature */ +{ + ASSERT(shadow_lock_is_acquired(d)); + + /* Sanity check the call */ + if ( d == current->domain || (d->arch.shadow.mode & mode) ) + { + return -EINVAL; + } + + if ( d->arch.shadow.mode == 0 ) + { + /* Init the shadow memory allocation and the hash table */ + if ( set_sh_allocation(d, 1, NULL) != 0 + || shadow_hash_alloc(d) != 0 ) + { + set_sh_allocation(d, 0, NULL); + return -ENOMEM; + } + } + + /* Update the bits */ + sh_new_mode(d, d->arch.shadow.mode | mode); + + return 0; +} + +static int shadow_one_bit_disable(struct domain *d, u32 mode) +/* Turn off a single shadow mode feature */ +{ + struct vcpu *v; + ASSERT(shadow_lock_is_acquired(d)); + + /* Sanity check the call */ + if ( d == current->domain || !(d->arch.shadow.mode & mode) ) + { + return -EINVAL; + } + + /* Update the bits */ + sh_new_mode(d, d->arch.shadow.mode & ~mode); + if ( d->arch.shadow.mode == 0 ) + { + /* Get this domain off shadows */ + SHADOW_PRINTK("un-shadowing of domain %u starts." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow.total_pages, + d->arch.shadow.free_pages, + d->arch.shadow.p2m_pages); + for_each_vcpu(d, v) + { + shadow_detach_old_tables(v); +#if CONFIG_PAGING_LEVELS == 4 + if ( !(v->arch.flags & TF_kernel_mode) ) + make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user)); + else +#endif + make_cr3(v, pagetable_get_pfn(v->arch.guest_table)); + + } + + /* Pull down the memory allocation */ + if ( set_sh_allocation(d, 0, NULL) != 0 ) + { + // XXX - How can this occur? + // Seems like a bug to return an error now that we've + // disabled the relevant shadow mode. + // + return -ENOMEM; + } + shadow_hash_teardown(d); + SHADOW_PRINTK("un-shadowing of domain %u done." 
+ " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow.total_pages, + d->arch.shadow.free_pages, + d->arch.shadow.p2m_pages); + } + + return 0; +} + +/* Enable/disable ops for the "test" and "log-dirty" modes */ +int shadow_test_enable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow_lock(d); + + if ( shadow_mode_enabled(d) ) + { + SHADOW_ERROR("Don't support enabling test mode" + "on already shadowed doms\n"); + ret = -EINVAL; + goto out; + } + + ret = shadow_one_bit_enable(d, SHM2_enable); + out: + shadow_unlock(d); + domain_unpause(d); + + return ret; +} + +int shadow_test_disable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow_lock(d); + ret = shadow_one_bit_disable(d, SHM2_enable); + shadow_unlock(d); + domain_unpause(d); + + return ret; +} + +static int +sh_alloc_log_dirty_bitmap(struct domain *d) +{ + ASSERT(d->arch.shadow.dirty_bitmap == NULL); + d->arch.shadow.dirty_bitmap_size = + (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) & + ~(BITS_PER_LONG - 1); + d->arch.shadow.dirty_bitmap = + xmalloc_array(unsigned long, + d->arch.shadow.dirty_bitmap_size / BITS_PER_LONG); + if ( d->arch.shadow.dirty_bitmap == NULL ) + { + d->arch.shadow.dirty_bitmap_size = 0; + return -ENOMEM; + } + memset(d->arch.shadow.dirty_bitmap, 0, d->arch.shadow.dirty_bitmap_size/8); + + return 0; +} + +static void +sh_free_log_dirty_bitmap(struct domain *d) +{ + d->arch.shadow.dirty_bitmap_size = 0; + if ( d->arch.shadow.dirty_bitmap ) + { + xfree(d->arch.shadow.dirty_bitmap); + d->arch.shadow.dirty_bitmap = NULL; + } +} + +static int shadow_log_dirty_enable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow_lock(d); + + if ( shadow_mode_log_dirty(d) ) + { + ret = -EINVAL; + goto out; + } + + if ( shadow_mode_enabled(d) ) + { + SHADOW_ERROR("Don't (yet) support enabling log-dirty" + "on already shadowed doms\n"); + ret = -EINVAL; + goto out; + } + + ret = sh_alloc_log_dirty_bitmap(d); + if ( ret != 0 ) + { + 
sh_free_log_dirty_bitmap(d); + goto out; + } + + ret = shadow_one_bit_enable(d, SHM2_log_dirty); + if ( ret != 0 ) + sh_free_log_dirty_bitmap(d); + + out: + shadow_unlock(d); + domain_unpause(d); + return ret; +} + +static int shadow_log_dirty_disable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow_lock(d); + ret = shadow_one_bit_disable(d, SHM2_log_dirty); + if ( !shadow_mode_log_dirty(d) ) + sh_free_log_dirty_bitmap(d); + shadow_unlock(d); + domain_unpause(d); + + return ret; +} + +/**************************************************************************/ +/* P2M map manipulations */ + +static void +sh_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn) +{ + struct vcpu *v; + + if ( !shadow_mode_translate(d) ) + return; + + v = current; + if ( v->domain != d ) + v = d->vcpu[0]; + + + SHADOW_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn); + + ASSERT(mfn_x(sh_gfn_to_mfn(d, gfn)) == mfn); + //ASSERT(sh_mfn_to_gfn(d, mfn) == gfn); + + shadow_remove_all_shadows_and_parents(v, _mfn(mfn)); + if ( shadow_remove_all_mappings(v, _mfn(mfn)) ) + flush_tlb_mask(d->domain_dirty_cpumask); + shadow_set_p2m_entry(d, gfn, _mfn(INVALID_MFN)); + set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); +} + +void +shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn, + unsigned long mfn) +{ + shadow_lock(d); + shadow_audit_p2m(d); + sh_p2m_remove_page(d, gfn, mfn); + shadow_audit_p2m(d); + shadow_unlock(d); +} + +void +shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn, + unsigned long mfn) +{ + struct vcpu *v; + unsigned long ogfn; + mfn_t omfn; + + if ( !shadow_mode_translate(d) ) + return; + + v = current; + if ( v->domain != d ) + v = d->vcpu[0]; + + shadow_lock(d); + shadow_audit_p2m(d); + + SHADOW_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn); + + omfn = sh_gfn_to_mfn(d, gfn); + if ( valid_mfn(omfn) ) + { + /* Get rid of the old mapping, especially any shadows */ + shadow_remove_all_shadows_and_parents(v, omfn); + 
if ( shadow_remove_all_mappings(v, omfn) ) + flush_tlb_mask(d->domain_dirty_cpumask); + set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); + } + + ogfn = sh_mfn_to_gfn(d, _mfn(mfn)); + if ( +#ifdef __x86_64__ + (ogfn != 0x5555555555555555L) +#else + (ogfn != 0x55555555L) +#endif + && (ogfn != INVALID_M2P_ENTRY) + && (ogfn != gfn) ) + { + /* This machine frame is already mapped at another physical address */ + SHADOW_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n", + mfn, ogfn, gfn); + if ( valid_mfn(omfn = sh_gfn_to_mfn(d, ogfn)) ) + { + SHADOW_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n", + ogfn , mfn_x(omfn)); + if ( mfn_x(omfn) == mfn ) + sh_p2m_remove_page(d, ogfn, mfn); + } + } + + shadow_set_p2m_entry(d, gfn, _mfn(mfn)); + set_gpfn_from_mfn(mfn, gfn); + shadow_audit_p2m(d); + shadow_unlock(d); +} + +/**************************************************************************/ +/* Log-dirty mode support */ + +/* Convert a shadow to log-dirty mode. */ +void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn) +{ + BUG(); +} + + +/* Read a domain's log-dirty bitmap and stats. + * If the operation is a CLEAN, clear the bitmap and stats as well. */ +static int shadow_log_dirty_op( + struct domain *d, struct xen_domctl_shadow_op *sc) +{ + int i, rv = 0, clean = 0; + + domain_pause(d); + shadow_lock(d); + + clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN); + + SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n", + (clean) ? "clean" : "peek", + d->domain_id, + d->arch.shadow.fault_count, + d->arch.shadow.dirty_count); + + sc->stats.fault_count = d->arch.shadow.fault_count; + sc->stats.dirty_count = d->arch.shadow.dirty_count; + + if ( clean ) + { + struct list_head *l, *t; + struct page_info *pg; + + /* Need to revoke write access to the domain's pages again. + * In future, we'll have a less heavy-handed approach to this, + * but for now, we just unshadow everything except Xen. 
*/ + list_for_each_safe(l, t, &d->arch.shadow.toplevel_shadows) + { + pg = list_entry(l, struct page_info, list); + shadow_unhook_mappings(d->vcpu[0], page_to_mfn(pg)); + } + + d->arch.shadow.fault_count = 0; + d->arch.shadow.dirty_count = 0; + } + + if ( guest_handle_is_null(sc->dirty_bitmap) || + (d->arch.shadow.dirty_bitmap == NULL) ) + { + rv = -EINVAL; + goto out; + } + + if ( sc->pages > d->arch.shadow.dirty_bitmap_size ) + sc->pages = d->arch.shadow.dirty_bitmap_size; + +#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */ + for ( i = 0; i < sc->pages; i += CHUNK ) + { + int bytes = ((((sc->pages - i) > CHUNK) + ? CHUNK + : (sc->pages - i)) + 7) / 8; + + if ( copy_to_guest_offset( + sc->dirty_bitmap, + i/(8*sizeof(unsigned long)), + d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))), + (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) ) + { + rv = -EINVAL; + goto out; + } + + if ( clean ) + memset(d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))), + 0, bytes); + } +#undef CHUNK + + out: + shadow_unlock(d); + domain_unpause(d); + return 0; +} + + +/* Mark a page as dirty */ +void sh_do_mark_dirty(struct domain *d, mfn_t gmfn) +{ + unsigned long pfn; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(shadow_mode_log_dirty(d)); + + if ( !valid_mfn(gmfn) ) + return; + + ASSERT(d->arch.shadow.dirty_bitmap != NULL); + + /* We /really/ mean PFN here, even for non-translated guests. */ + pfn = get_gpfn_from_mfn(mfn_x(gmfn)); + + /* + * Values with the MSB set denote MFNs that aren't really part of the + * domain's pseudo-physical memory map (e.g., the shared info frame). + * Nothing to do here... + */ + if ( unlikely(!VALID_M2P(pfn)) ) + return; + + /* N.B. Can use non-atomic TAS because protected by shadow_lock. 
*/ + if ( likely(pfn < d->arch.shadow.dirty_bitmap_size) ) + { + if ( !__test_and_set_bit(pfn, d->arch.shadow.dirty_bitmap) ) + { + SHADOW_DEBUG(LOGDIRTY, + "marked mfn %" SH_PRI_mfn " (pfn=%lx), dom %d\n", + mfn_x(gmfn), pfn, d->domain_id); + d->arch.shadow.dirty_count++; + } + } + else + { + SHADOW_PRINTK("mark_dirty OOR! " + "mfn=%" SH_PRI_mfn " pfn=%lx max=%x (dom %d)\n" + "owner=%d c=%08x t=%" PRtype_info "\n", + mfn_x(gmfn), + pfn, + d->arch.shadow.dirty_bitmap_size, + d->domain_id, + (page_get_owner(mfn_to_page(gmfn)) + ? page_get_owner(mfn_to_page(gmfn))->domain_id + : -1), + mfn_to_page(gmfn)->count_info, + mfn_to_page(gmfn)->u.inuse.type_info); + } +} + + +/**************************************************************************/ +/* Shadow-control XEN_DOMCTL dispatcher */ + +int shadow_domctl(struct domain *d, + xen_domctl_shadow_op_t *sc, + XEN_GUEST_HANDLE(xen_domctl_t) u_domctl) +{ + int rc, preempted = 0; + + if ( unlikely(d == current->domain) ) + { + DPRINTK("Don't try to do a shadow op on yourself!\n"); + return -EINVAL; + } + + switch ( sc->op ) + { + case XEN_DOMCTL_SHADOW_OP_OFF: + if ( shadow_mode_log_dirty(d) ) + if ( (rc = shadow_log_dirty_disable(d)) != 0 ) + return rc; + if ( d->arch.shadow.mode & SHM2_enable ) + if ( (rc = shadow_test_disable(d)) != 0 ) + return rc; + return 0; + + case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST: + return shadow_test_enable(d); + + case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY: + return shadow_log_dirty_enable(d); + + case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE: + return shadow_enable(d, SHM2_refcounts|SHM2_translate); + + case XEN_DOMCTL_SHADOW_OP_CLEAN: + case XEN_DOMCTL_SHADOW_OP_PEEK: + return shadow_log_dirty_op(d, sc); + + case XEN_DOMCTL_SHADOW_OP_ENABLE: + if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY ) + return shadow_log_dirty_enable(d); + return shadow_enable(d, sc->mode << SHM2_shift); + + case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: + sc->mb = shadow_get_allocation(d); + return 0; + + case 
XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: + rc = shadow_set_allocation(d, sc->mb, &preempted); + if ( preempted ) + /* Not finished. Set up to re-run the call. */ + rc = hypercall_create_continuation( + __HYPERVISOR_domctl, "h", u_domctl); + else + /* Finished. Return the new allocation */ + sc->mb = shadow_get_allocation(d); + return rc; + + default: + SHADOW_ERROR("Bad shadow op %u\n", sc->op); + return -EINVAL; + } +} + + +/**************************************************************************/ +/* Auditing shadow tables */ + +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL + +void shadow_audit_tables(struct vcpu *v) +{ + /* Dispatch table for getting per-type functions */ + static hash_callback_t callbacks[16] = { + NULL, /* none */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2), /* l1_32 */ + SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32 */ + SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2), /* l2_32 */ +#else + SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2), /* l1_32 */ + SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32 */ + SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2), /* l2_32 */ + SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3), /* l1_pae */ + SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */ + SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2_pae */ + SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2h_pae */ + SHADOW_INTERNAL_NAME(sh_audit_l3_table,3,3), /* l3_pae */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4), /* l1_64 */ + SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64 */ + SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2_64 */ + SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4), /* l3_64 */ + SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4), /* l4_64 */ +#endif /* CONFIG_PAGING_LEVELS >= 4 */ +#endif /* CONFIG_PAGING_LEVELS > 2 */ + NULL /* All the rest */ + }; + unsigned int mask; + + if ( !(SHADOW_AUDIT_ENABLE) ) + return; + + if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL ) + 
mask = ~1; /* Audit every table in the system */ + else + { + /* Audit only the current mode's tables */ + switch ( v->arch.shadow.mode->guest_levels ) + { + case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break; + case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE + |SHF_L2H_PAE|SHF_L3_PAE); break; + case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64 + |SHF_L3_64|SHF_L4_64); break; + default: BUG(); + } + } + + hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN)); +} + +#endif /* Shadow audit */ + + +/**************************************************************************/ +/* Auditing p2m tables */ + +#if SHADOW_AUDIT & SHADOW_AUDIT_P2M + +void shadow_audit_p2m(struct domain *d) +{ + struct list_head *entry; + struct page_info *page; + struct domain *od; + unsigned long mfn, gfn, m2pfn, lp2mfn = 0; + mfn_t p2mfn; + unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0; + int test_linear; + + if ( !(SHADOW_AUDIT_ENABLE) || !shadow_mode_translate(d) ) + return; + + //SHADOW_PRINTK("p2m audit starts\n"); + + test_linear = ( (d == current->domain) && current->arch.monitor_vtable ); + if ( test_linear ) + local_flush_tlb(); + + /* Audit part one: walk the domain's page allocation list, checking + * the m2p entries. 
*/ + for ( entry = d->page_list.next; + entry != &d->page_list; + entry = entry->next ) + { + page = list_entry(entry, struct page_info, list); + mfn = mfn_x(page_to_mfn(page)); + + // SHADOW_PRINTK("auditing guest page, mfn=%#lx\n", mfn); + + od = page_get_owner(page); + + if ( od != d ) + { + SHADOW_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n", + mfn, od, (od?od->domain_id:-1), d, d->domain_id); + continue; + } + + gfn = get_gpfn_from_mfn(mfn); + if ( gfn == INVALID_M2P_ENTRY ) + { + orphans_i++; + //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n", + // mfn); + continue; + } + + if ( gfn == 0x55555555 ) + { + orphans_d++; + //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n", + // mfn); + continue; + } + + p2mfn = sh_gfn_to_mfn_foreign(d, gfn); + if ( mfn_x(p2mfn) != mfn ) + { + mpbad++; + SHADOW_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx" + " (-> gfn %#lx)\n", + mfn, gfn, mfn_x(p2mfn), + (mfn_valid(p2mfn) + ? get_gpfn_from_mfn(mfn_x(p2mfn)) + : -1u)); + /* This m2p entry is stale: the domain has another frame in + * this physical slot. No great disaster, but for neatness, + * blow away the m2p entry. */ + set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); + } + + if ( test_linear ) + { + lp2mfn = get_mfn_from_gpfn(gfn); + if ( lp2mfn != mfn_x(p2mfn) ) + { + SHADOW_PRINTK("linear mismatch gfn %#lx -> mfn %#lx " + "(!= mfn %#lx)\n", gfn, lp2mfn, p2mfn); + } + } + + // SHADOW_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n", + // mfn, gfn, p2mfn, lp2mfn); + } + + /* Audit part two: walk the domain's p2m table, checking the entries. 
*/ + if ( pagetable_get_pfn(d->arch.phys_table) != 0 ) + { + l2_pgentry_t *l2e; + l1_pgentry_t *l1e; + int i1, i2; + +#if CONFIG_PAGING_LEVELS == 4 + l4_pgentry_t *l4e; + l3_pgentry_t *l3e; + int i3, i4; + l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); +#elif CONFIG_PAGING_LEVELS == 3 + l3_pgentry_t *l3e; + int i3; + l3e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); +#else /* CONFIG_PAGING_LEVELS == 2 */ + l2e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); +#endif + + gfn = 0; +#if CONFIG_PAGING_LEVELS >= 3 +#if CONFIG_PAGING_LEVELS >= 4 + for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ ) + { + if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) ) + { + gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } + l3e = sh_map_domain_page(_mfn(l4e_get_pfn(l4e[i4]))); +#endif /* now at levels 3 or 4... */ + for ( i3 = 0; + i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8); + i3++ ) + { + if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) ) + { + gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } + l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[i3]))); +#endif /* all levels... 
*/ + for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ ) + { + if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) ) + { + gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } + l1e = sh_map_domain_page(_mfn(l2e_get_pfn(l2e[i2]))); + + for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ ) + { + if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) ) + continue; + mfn = l1e_get_pfn(l1e[i1]); + ASSERT(valid_mfn(_mfn(mfn))); + m2pfn = get_gpfn_from_mfn(mfn); + if ( m2pfn != gfn ) + { + pmbad++; + SHADOW_PRINTK("mismatch: gfn %#lx -> mfn %#lx" + " -> gfn %#lx\n", gfn, mfn, m2pfn); + BUG(); + } + } + sh_unmap_domain_page(l1e); + } +#if CONFIG_PAGING_LEVELS >= 3 + sh_unmap_domain_page(l2e); + } +#if CONFIG_PAGING_LEVELS >= 4 + sh_unmap_domain_page(l3e); + } +#endif +#endif + +#if CONFIG_PAGING_LEVELS == 4 + sh_unmap_domain_page(l4e); +#elif CONFIG_PAGING_LEVELS == 3 + sh_unmap_domain_page(l3e); +#else /* CONFIG_PAGING_LEVELS == 2 */ + sh_unmap_domain_page(l2e); +#endif + + } + + //SHADOW_PRINTK("p2m audit complete\n"); + //if ( orphans_i | orphans_d | mpbad | pmbad ) + // SHADOW_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n", + // orphans_i + orphans_d, orphans_i, orphans_d, + if ( mpbad | pmbad ) + SHADOW_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n", + pmbad, mpbad); +} + +#endif /* p2m audit */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c new file mode 100644 index 0000000000..aed36c5f34 --- /dev/null +++ b/xen/arch/x86/mm/shadow/multi.c @@ -0,0 +1,4492 @@ +/****************************************************************************** + * arch/x86/mm/shadow/multi.c + * + * Simple, mostly-synchronous shadow page tables. + * Parts of this code are Copyright (c) 2006 by XenSource Inc. 
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +// DESIGN QUESTIONS: +// Why use subshadows for PAE guests? +// - reduces pressure in the hash table +// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3) +// - would need to find space in the page_info to store 7 more bits of +// backpointer +// - independent shadows of 32 byte chunks makes it non-obvious how to quickly +// figure out when to demote the guest page from l3 status +// +// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space. +// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address +// space for both PV and HVM guests. +// + +#define SHADOW 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "private.h" +#include "types.h" + +/* The first cut: an absolutely synchronous, trap-and-emulate version, + * supporting only HVM guests (and so only "external" shadow mode). + * + * THINGS TO DO LATER: + * + * FIX GVA_TO_GPA + * The current interface returns an unsigned long, which is not big enough + * to hold a physical address in PAE. 
Should return a gfn instead. + * + * TEARDOWN HEURISTICS + * Also: have a heuristic for when to destroy a previous paging-mode's + * shadows. When a guest is done with its start-of-day 32-bit tables + * and reuses the memory we want to drop those shadows. Start with + * shadows in a page in two modes as a hint, but beware of clever tricks + * like reusing a pagetable for both PAE and 64-bit during boot... + * + * PAE LINEAR MAPS + * Rework shadow_get_l*e() to have the option of using map_domain_page() + * instead of linear maps. Add appropriate unmap_l*e calls in the users. + * Then we can test the speed difference made by linear maps. If the + * map_domain_page() version is OK on PAE, we could maybe allow a lightweight + * l3-and-l2h-only shadow mode for PAE PV guests that would allow them + * to share l2h pages again. + * + * PAE L3 COPYING + * In this code, we copy all 32 bytes of a PAE L3 every time we change an + * entry in it, and every time we change CR3. We copy it for the linear + * mappings (ugh! PAE linear mappings) and we copy it to the low-memory + * buffer so it fits in CR3. Maybe we can avoid some of this recopying + * by using the shadow directly in some places. + * Also, for SMP, need to actually respond to seeing shadow.pae_flip_pending. + * + * GUEST_WALK_TABLES TLB FLUSH COALESCE + * guest_walk_tables can do up to three remote TLB flushes as it walks to + * the first l1 of a new pagetable. Should coalesce the flushes to the end, + * and if we do flush, re-do the walk. If anything has changed, then + * pause all the other vcpus and do the walk *again*. + * + * WP DISABLED + * Consider how to implement having the WP bit of CR0 set to 0. + * Since we need to be able to cause write faults to pagetables, this might + * end up looking like not having the (guest) pagetables present at all in + * HVM guests... + * + * PSE disabled / PSE36 + * We don't support any modes other than PSE enabled, PSE36 disabled. 
+ * Neither of those would be hard to change, but we'd need to be able to + * deal with shadows made in one mode and used in another. + */ + +#define FETCH_TYPE_PREFETCH 1 +#define FETCH_TYPE_DEMAND 2 +#define FETCH_TYPE_WRITE 4 +typedef enum { + ft_prefetch = FETCH_TYPE_PREFETCH, + ft_demand_read = FETCH_TYPE_DEMAND, + ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE, +} fetch_type_t; + +#ifdef DEBUG_TRACE_DUMP +static char *fetch_type_names[] = { + [ft_prefetch] "prefetch", + [ft_demand_read] "demand read", + [ft_demand_write] "demand write", +}; +#endif + +/* XXX forward declarations */ +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) +static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res); +#endif +static inline void sh_update_linear_entries(struct vcpu *v); + +/**************************************************************************/ +/* Hash table mapping from guest pagetables to shadows + * + * Normal case: maps the mfn of a guest page to the mfn of its shadow page. + * FL1's: maps the *gfn* of the start of a superpage to the mfn of a + * shadow L1 which maps its "splinters". + * PAE CR3s: maps the 32-byte aligned, 32-bit CR3 value to the mfn of the + * PAE L3 info page for that CR3 value. 
+ */ + +static inline mfn_t +get_fl1_shadow_status(struct vcpu *v, gfn_t gfn) +/* Look for FL1 shadows in the hash table */ +{ + mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), + PGC_SH_fl1_shadow >> PGC_SH_type_shift); + + if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) ) + { + struct page_info *page = mfn_to_page(smfn); + if ( !(page->count_info & PGC_SH_log_dirty) ) + shadow_convert_to_log_dirty(v, smfn); + } + + return smfn; +} + +static inline mfn_t +get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type) +/* Look for shadows in the hash table */ +{ + mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), + shadow_type >> PGC_SH_type_shift); + perfc_incrc(shadow_get_shadow_status); + + if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) ) + { + struct page_info *page = mfn_to_page(smfn); + if ( !(page->count_info & PGC_SH_log_dirty) ) + shadow_convert_to_log_dirty(v, smfn); + } + + return smfn; +} + +static inline void +set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn) +/* Put an FL1 shadow into the hash table */ +{ + SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n", + gfn_x(gfn), PGC_SH_fl1_shadow, mfn_x(smfn)); + + if ( unlikely(shadow_mode_log_dirty(v->domain)) ) + // mark this shadow as a log dirty shadow... + set_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info); + else + clear_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info); + + shadow_hash_insert(v, gfn_x(gfn), + PGC_SH_fl1_shadow >> PGC_SH_type_shift, smfn); +} + +static inline void +set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn) +/* Put a shadow into the hash table */ +{ + struct domain *d = v->domain; + int res; + + SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n", + d->domain_id, v->vcpu_id, mfn_x(gmfn), + shadow_type, mfn_x(smfn)); + + if ( unlikely(shadow_mode_log_dirty(d)) ) + // mark this shadow as a log dirty shadow... 
+ set_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info); + else + clear_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info); + + res = get_page(mfn_to_page(gmfn), d); + ASSERT(res == 1); + + shadow_hash_insert(v, mfn_x(gmfn), shadow_type >> PGC_SH_type_shift, + smfn); +} + +static inline void +delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn) +/* Remove a shadow from the hash table */ +{ + SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n", + gfn_x(gfn), PGC_SH_fl1_shadow, mfn_x(smfn)); + + shadow_hash_delete(v, gfn_x(gfn), + PGC_SH_fl1_shadow >> PGC_SH_type_shift, smfn); +} + +static inline void +delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn) +/* Remove a shadow from the hash table */ +{ + SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n", + v->domain->domain_id, v->vcpu_id, + mfn_x(gmfn), shadow_type, mfn_x(smfn)); + shadow_hash_delete(v, mfn_x(gmfn), + shadow_type >> PGC_SH_type_shift, smfn); + put_page(mfn_to_page(gmfn)); +} + +/**************************************************************************/ +/* CPU feature support querying */ + +static inline int +guest_supports_superpages(struct vcpu *v) +{ + /* The _PAGE_PSE bit must be honoured in HVM guests, whenever + * CR4.PSE is set or the guest is in PAE or long mode */ + return (hvm_guest(v) && (GUEST_PAGING_LEVELS != 2 + || (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE))); +} + +static inline int +guest_supports_nx(struct vcpu *v) +{ + if ( !hvm_guest(v) ) + return cpu_has_nx; + + // XXX - fix this! + return 1; +} + + +/**************************************************************************/ +/* Functions for walking the guest page tables */ + + +/* Walk the guest pagetables, filling the walk_t with what we see. + * Takes an uninitialised walk_t. The caller must call unmap_walk() + * on the walk_t before discarding it or calling guest_walk_tables again. 
+ * If "guest_op" is non-zero, we are serving a genuine guest memory access, + * and must (a) be under the shadow lock, and (b) remove write access + * from any gueat PT pages we see, as we will be using their contents to + * perform shadow updates. + * Returns 0 for success or non-zero if the guest pagetables are malformed. + * N.B. Finding a not-present entry does not cause a non-zero return code. */ +static inline int +guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op) +{ + ASSERT(!guest_op || shadow_lock_is_acquired(v->domain)); + + perfc_incrc(shadow_guest_walk); + memset(gw, 0, sizeof(*gw)); + gw->va = va; + +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + /* Get l4e from the top level table */ + gw->l4mfn = pagetable_get_mfn(v->arch.guest_table); + gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va); + /* Walk down to the l3e */ + if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0; + gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e)); + if ( !valid_mfn(gw->l3mfn) ) return 1; + /* This mfn is a pagetable: make sure the guest can't write to it. */ + if ( guest_op && shadow_remove_write_access(v, gw->l3mfn, 3, va) != 0 ) + flush_tlb_mask(v->domain->domain_dirty_cpumask); + gw->l3e = ((guest_l3e_t *)sh_map_domain_page(gw->l3mfn)) + + guest_l3_table_offset(va); +#else /* PAE only... */ + /* Get l3e from the top level table */ + gw->l3mfn = pagetable_get_mfn(v->arch.guest_table); + gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va); +#endif /* PAE or 64... */ + /* Walk down to the l2e */ + if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0; + gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e)); + if ( !valid_mfn(gw->l2mfn) ) return 1; + /* This mfn is a pagetable: make sure the guest can't write to it. 
*/ + if ( guest_op && shadow_remove_write_access(v, gw->l2mfn, 2, va) != 0 ) + flush_tlb_mask(v->domain->domain_dirty_cpumask); + gw->l2e = ((guest_l2e_t *)sh_map_domain_page(gw->l2mfn)) + + guest_l2_table_offset(va); +#else /* 32-bit only... */ + /* Get l2e from the top level table */ + gw->l2mfn = pagetable_get_mfn(v->arch.guest_table); + gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va); +#endif /* All levels... */ + + if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0; + if ( guest_supports_superpages(v) && + (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) ) + { + /* Special case: this guest VA is in a PSE superpage, so there's + * no guest l1e. We make one up so that the propagation code + * can generate a shadow l1 table. Start with the gfn of the + * first 4k-page of the superpage. */ + gfn_t start = guest_l2e_get_gfn(*gw->l2e); + /* Grant full access in the l1e, since all the guest entry's + * access controls are enforced in the shadow l2e. This lets + * us reflect l2 changes later without touching the l1s. */ + int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| + _PAGE_ACCESSED|_PAGE_DIRTY); + /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7 + * of the level 1 */ + if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) ) + flags |= _PAGE_PAT; + /* Increment the pfn by the right number of 4k pages. + * The ~0x1 is to mask out the PAT bit mentioned above. */ + start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va)); + gw->eff_l1e = guest_l1e_from_gfn(start, flags); + gw->l1e = NULL; + gw->l1mfn = _mfn(INVALID_MFN); + } + else + { + /* Not a superpage: carry on and find the l1e. */ + gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e)); + if ( !valid_mfn(gw->l1mfn) ) return 1; + /* This mfn is a pagetable: make sure the guest can't write to it. 
*/ + if ( guest_op + && shadow_remove_write_access(v, gw->l1mfn, 1, va) != 0 ) + flush_tlb_mask(v->domain->domain_dirty_cpumask); + gw->l1e = ((guest_l1e_t *)sh_map_domain_page(gw->l1mfn)) + + guest_l1_table_offset(va); + gw->eff_l1e = *gw->l1e; + } + + return 0; +} + +/* Given a walk_t, translate the gw->va into the guest's notion of the + * corresponding frame number. */ +static inline gfn_t +guest_walk_to_gfn(walk_t *gw) +{ + if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) ) + return _gfn(INVALID_GFN); + return guest_l1e_get_gfn(gw->eff_l1e); +} + +/* Given a walk_t, translate the gw->va into the guest's notion of the + * corresponding physical address. */ +static inline paddr_t +guest_walk_to_gpa(walk_t *gw) +{ + if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) ) + return 0; + return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK); +} + + +/* Unmap (and reinitialise) a guest walk. + * Call this to dispose of any walk filled in by guest_walk_tables() */ +static void unmap_walk(struct vcpu *v, walk_t *gw) +{ +#if GUEST_PAGING_LEVELS >= 3 +#if GUEST_PAGING_LEVELS >= 4 + if ( gw->l3e != NULL ) sh_unmap_domain_page(gw->l3e); +#endif + if ( gw->l2e != NULL ) sh_unmap_domain_page(gw->l2e); +#endif + if ( gw->l1e != NULL ) sh_unmap_domain_page(gw->l1e); +#ifdef DEBUG + memset(gw, 0, sizeof(*gw)); +#endif +} + + +/* Pretty-print the contents of a guest-walk */ +static inline void print_gw(walk_t *gw) +{ + SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va); +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + SHADOW_PRINTK(" l4mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l4mfn)); + SHADOW_PRINTK(" l4e=%p\n", gw->l4e); + if ( gw->l4e ) + SHADOW_PRINTK(" *l4e=%" SH_PRI_gpte "\n", gw->l4e->l4); +#endif /* PAE or 64... 
*/ + SHADOW_PRINTK(" l3mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l3mfn)); + SHADOW_PRINTK(" l3e=%p\n", gw->l3e); + if ( gw->l3e ) + SHADOW_PRINTK(" *l3e=%" SH_PRI_gpte "\n", gw->l3e->l3); +#endif /* All levels... */ + SHADOW_PRINTK(" l2mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l2mfn)); + SHADOW_PRINTK(" l2e=%p\n", gw->l2e); + if ( gw->l2e ) + SHADOW_PRINTK(" *l2e=%" SH_PRI_gpte "\n", gw->l2e->l2); + SHADOW_PRINTK(" l1mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l1mfn)); + SHADOW_PRINTK(" l1e=%p\n", gw->l1e); + if ( gw->l1e ) + SHADOW_PRINTK(" *l1e=%" SH_PRI_gpte "\n", gw->l1e->l1); + SHADOW_PRINTK(" eff_l1e=%" SH_PRI_gpte "\n", gw->eff_l1e.l1); +} + + +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES +/* Lightweight audit: pass all the shadows associated with this guest walk + * through the audit mechanisms */ +static void sh_audit_gw(struct vcpu *v, walk_t *gw) +{ + mfn_t smfn; + + if ( !(SHADOW_AUDIT_ENABLE) ) + return; + +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + if ( valid_mfn(gw->l4mfn) + && valid_mfn((smfn = get_shadow_status(v, gw->l4mfn, + PGC_SH_l4_shadow))) ) + (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN)); +#endif /* PAE or 64... */ + if ( valid_mfn(gw->l3mfn) + && valid_mfn((smfn = get_shadow_status(v, gw->l3mfn, + PGC_SH_l3_shadow))) ) + (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN)); +#endif /* All levels... 
*/ + if ( valid_mfn(gw->l2mfn) ) + { + if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn, + PGC_SH_l2_shadow))) ) + (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN)); +#if GUEST_PAGING_LEVELS == 3 + if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn, + PGC_SH_l2h_shadow))) ) + (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN)); +#endif + } + if ( valid_mfn(gw->l1mfn) + && valid_mfn((smfn = get_shadow_status(v, gw->l1mfn, + PGC_SH_l1_shadow))) ) + (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN)); + else if ( gw->l2e + && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) + && valid_mfn( + (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) ) + (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN)); +} + +#else +#define sh_audit_gw(_v, _gw) do {} while(0) +#endif /* audit code */ + + + +/**************************************************************************/ +/* Function to write to the guest tables, for propagating accessed and + * dirty bits from the shadow to the guest. + * Takes a guest mfn, a pointer to the guest entry, the level of pagetable, + * and an operation type. The guest entry is always passed as an l1e: + * since we only ever write flags, that's OK. + * Returns the new flag bits of the guest entry. 
*/ + +static u32 guest_set_ad_bits(struct vcpu *v, + mfn_t gmfn, + guest_l1e_t *ep, + unsigned int level, + fetch_type_t ft) +{ + u32 flags, shflags, bit; + struct page_info *pg; + int res = 0; + + ASSERT(valid_mfn(gmfn) + && (sh_mfn_is_a_page_table(gmfn) + || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) + == 0))); + ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1))); + ASSERT(level <= GUEST_PAGING_LEVELS); + ASSERT(ft == ft_demand_read || ft == ft_demand_write); + ASSERT(shadow_lock_is_acquired(v->domain)); + + flags = guest_l1e_get_flags(*ep); + + /* PAE l3s do not have A and D bits */ + if ( unlikely(GUEST_PAGING_LEVELS == 3 && level == 3) ) + return flags; + + /* Need the D bit as well for writes, in l1es and 32bit/PAE PSE l2es. */ + if ( ft == ft_demand_write + && (level == 1 || + (level == 2 && GUEST_PAGING_LEVELS < 4 + && (flags & _PAGE_PSE) && guest_supports_superpages(v))) ) + { + if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) + == (_PAGE_DIRTY | _PAGE_ACCESSED) ) + return flags; /* Guest already has A and D bits set */ + flags |= _PAGE_DIRTY | _PAGE_ACCESSED; + perfc_incrc(shadow_ad_update); + } + else + { + if ( flags & _PAGE_ACCESSED ) + return flags; /* Guest already has A bit set */ + flags |= _PAGE_ACCESSED; + perfc_incrc(shadow_a_update); + } + + /* Set the bit(s) */ + sh_mark_dirty(v->domain, gmfn); + SHADOW_DEBUG(A_AND_D, "gfn = %"SH_PRI_gfn", " + "old flags = %#x, new flags = %#x\n", + guest_l1e_get_gfn(*ep), guest_l1e_get_flags(*ep), flags); + *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags); + + /* May need to propagate this change forward to other kinds of shadow */ + pg = mfn_to_page(gmfn); + if ( !sh_mfn_is_a_page_table(gmfn) ) + { + /* This guest pagetable is not yet shadowed at all. */ + // MAF: I think this assert is busted... If this gmfn has not yet + // been promoted, then it seems perfectly reasonable for there to be + // outstanding type refs to it... + /* TJD: No. 
If the gmfn has not been promoted, we must at least + * have recognised that it is a pagetable, and pulled write access. + * The type count should only be non-zero if it is actually a page + * table. The test above was incorrect, though, so I've fixed it. */ + ASSERT((pg->u.inuse.type_info & PGT_count_mask) == 0); + return flags; + } + + shflags = pg->shadow_flags & SHF_page_type_mask; + while ( shflags ) + { + bit = find_first_set_bit(shflags); + ASSERT(shflags & (1u << bit)); + shflags &= ~(1u << bit); + if ( !(pg->shadow_flags & (1u << bit)) ) + continue; + switch ( bit ) + { + case PGC_SH_type_to_index(PGC_SH_l1_shadow): + if (level != 1) + res |= sh_map_and_validate_gl1e(v, gmfn, ep, sizeof (*ep)); + break; + case PGC_SH_type_to_index(PGC_SH_l2_shadow): + if (level != 2) + res |= sh_map_and_validate_gl2e(v, gmfn, ep, sizeof (*ep)); + break; +#if GUEST_PAGING_LEVELS == 3 /* PAE only */ + case PGC_SH_type_to_index(PGC_SH_l2h_shadow): + if (level != 2) + res |= sh_map_and_validate_gl2he(v, gmfn, ep, sizeof (*ep)); + break; +#endif +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ + case PGC_SH_type_to_index(PGC_SH_l3_shadow): + if (level != 3) + res |= sh_map_and_validate_gl3e(v, gmfn, ep, sizeof (*ep)); + break; +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + case PGC_SH_type_to_index(PGC_SH_l4_shadow): + if (level != 4) + res |= sh_map_and_validate_gl4e(v, gmfn, ep, sizeof (*ep)); + break; +#endif +#endif + default: + SHADOW_ERROR("mfn %"SH_PRI_mfn" is shadowed in multiple " + "modes: A&D bits may be out of sync (flags=%#x).\n", + mfn_x(gmfn), pg->shadow_flags); + /* XXX Shadows in other modes will not be updated, so will + * have their A and D bits out of sync. 
*/ + } + } + + /* We should never need to flush the TLB or recopy PAE entries */ + ASSERT( res == 0 || res == SHADOW_SET_CHANGED ); + return flags; +} + +/**************************************************************************/ +/* Functions to compute the correct index into a shadow page, given an + * index into the guest page (as returned by guest_get_index()). + * This is trivial when the shadow and guest use the same sized PTEs, but + * gets more interesting when those sizes are mismatched (e.g. 32-bit guest, + * PAE- or 64-bit shadows). + * + * These functions also increment the shadow mfn, when necessary. When PTE + * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1 + * page. In this case, we allocate 2 contiguous pages for the shadow L1, and + * use simple pointer arithmetic on a pointer to the guest L1e to figure out + * which shadow page we really want. Similarly, when PTE sizes are + * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest + * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address + * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address + * space.) + * + * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes + * of shadow (to store both the shadow, and the info that would normally be + * stored in page_info fields). This arrangement allows the shadow and the + * "page_info" fields to always be stored in the same page (in fact, in + * the same cache line), avoiding an extra call to map_domain_page(). 
+ */ + +static inline u32 +guest_index(void *ptr) +{ + return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t); +} + +static inline u32 +shadow_l1_index(mfn_t *smfn, u32 guest_index) +{ +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2) + *smfn = _mfn(mfn_x(*smfn) + + (guest_index / SHADOW_L1_PAGETABLE_ENTRIES)); + return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES); +#else + return guest_index; +#endif +} + +static inline u32 +shadow_l2_index(mfn_t *smfn, u32 guest_index) +{ +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2) + // Because we use 2 shadow l2 entries for each guest entry, the number of + // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2 + // + *smfn = _mfn(mfn_x(*smfn) + + (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2))); + + // We multiple by two to get the index of the first of the two entries + // used to shadow the specified guest entry. + return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2; +#else + return guest_index; +#endif +} + +#if GUEST_PAGING_LEVELS >= 3 + +static inline u32 +shadow_l3_index(mfn_t *smfn, u32 guest_index) +{ +#if GUEST_PAGING_LEVELS == 3 + u32 group_id; + + // Because we use twice the space in L3 shadows as was consumed in guest + // L3s, the number of guest entries per shadow page is + // SHADOW_L2_PAGETABLE_ENTRIES/2. (Note this is *not* + // SHADOW_L3_PAGETABLE_ENTRIES, which in this case is 4...) + // + *smfn = _mfn(mfn_x(*smfn) + + (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2))); + + // We store PAE L3 shadows in groups of 4, alternating shadows and + // pae_l3_bookkeeping structs. So the effective shadow index is + // the the group_id * 8 + the offset within the group. 
+ // + guest_index %= (SHADOW_L2_PAGETABLE_ENTRIES / 2); + group_id = guest_index / 4; + return (group_id * 8) + (guest_index % 4); +#else + return guest_index; +#endif +} + +#endif // GUEST_PAGING_LEVELS >= 3 + +#if GUEST_PAGING_LEVELS >= 4 + +static inline u32 +shadow_l4_index(mfn_t *smfn, u32 guest_index) +{ + return guest_index; +} + +#endif // GUEST_PAGING_LEVELS >= 4 + + +/**************************************************************************/ +/* Functions which compute shadow entries from their corresponding guest + * entries. + * + * These are the "heart" of the shadow code. + * + * There are two sets of these: those that are called on demand faults (read + * faults and write faults), and those that are essentially called to + * "prefetch" (or propagate) entries from the guest into the shadow. The read + * fault and write fault are handled as two separate cases for L1 entries (due + * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together + * into the respective demand_fault functions. + */ + +#define CHECK(_cond) \ +do { \ + if (unlikely(!(_cond))) \ + { \ + printk("%s %s %d ASSERTION (%s) FAILED\n", \ + __func__, __FILE__, __LINE__, #_cond); \ + return -1; \ + } \ +} while (0); + +// The function below tries to capture all of the flag manipulation for the +// demand and propagate functions into one place. +// +static always_inline u32 +sh_propagate_flags(struct vcpu *v, mfn_t target_mfn, + u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn, + int mmio, int level, fetch_type_t ft) +{ + struct domain *d = v->domain; + u32 pass_thru_flags; + u32 sflags; + + // XXX -- might want to think about PAT support for HVM guests... + +#ifndef NDEBUG + // MMIO can only occur from L1e's + // + if ( mmio ) + CHECK(level == 1); + + // We should always have a pointer to the guest entry if it's a non-PSE + // non-MMIO demand access. 
+ if ( ft & FETCH_TYPE_DEMAND ) + CHECK(guest_entry_ptr || level == 1); +#endif + + // A not-present guest entry has a special signature in the shadow table, + // so that we do not have to consult the guest tables multiple times... + // + if ( unlikely(!(gflags & _PAGE_PRESENT)) ) + return _PAGE_SHADOW_GUEST_NOT_PRESENT; + + // Must have a valid target_mfn, unless this is mmio, or unless this is a + // prefetch. In the case of a prefetch, an invalid mfn means that we can + // not usefully shadow anything, and so we return early. + // + if ( !valid_mfn(target_mfn) ) + { + CHECK((ft == ft_prefetch) || mmio); + if ( !mmio ) + return 0; + } + + // PAE does not allow NX, RW, USER, ACCESSED, or DIRTY bits in its L3e's... + // + if ( (SHADOW_PAGING_LEVELS == 3) && (level == 3) ) + pass_thru_flags = _PAGE_PRESENT; + else + { + pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER | + _PAGE_RW | _PAGE_PRESENT); + if ( guest_supports_nx(v) ) + pass_thru_flags |= _PAGE_NX_BIT; + } + + // PAE guests can not put NX, RW, USER, ACCESSED, or DIRTY bits into their + // L3e's; they are all implied. So we emulate them here. + // + if ( (GUEST_PAGING_LEVELS == 3) && (level == 3) ) + gflags = pass_thru_flags; + + // Propagate bits from the guest to the shadow. + // Some of these may be overwritten, below. + // Since we know the guest's PRESENT bit is set, we also set the shadow's + // SHADOW_PRESENT bit. + // + sflags = (gflags & pass_thru_flags) | _PAGE_SHADOW_PRESENT; + + // Copy the guest's RW bit into the SHADOW_RW bit. + // + if ( gflags & _PAGE_RW ) + sflags |= _PAGE_SHADOW_RW; + + // Set the A&D bits for higher level shadows. + // Higher level entries do not, strictly speaking, have dirty bits, but + // since we use shadow linear tables, each of these entries may, at some + // point in time, also serve as a shadow L1 entry. + // By setting both the A&D bits in each of these, we eliminate the burden + // on the hardware to update these bits on initial accesses. 
+ // + if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) ) + sflags |= _PAGE_ACCESSED | _PAGE_DIRTY; + + + // Set the A and D bits in the guest entry, if we need to. + if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) ) + gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft); + + // If the A or D bit has not yet been set in the guest, then we must + // prevent the corresponding kind of access. + // + if ( unlikely(!((GUEST_PAGING_LEVELS == 3) && (level == 3)) && + !(gflags & _PAGE_ACCESSED)) ) + sflags &= ~_PAGE_PRESENT; + + /* D bits exist in l1es, and 32bit/PAE PSE l2es, but not 64bit PSE l2es */ + if ( unlikely( ((level == 1) + || ((level == 2) && (GUEST_PAGING_LEVELS < 4) + && guest_supports_superpages(v) && + (gflags & _PAGE_PSE))) + && !(gflags & _PAGE_DIRTY)) ) + sflags &= ~_PAGE_RW; + + // MMIO caching + // + // MMIO mappings are marked as not present, but we set the SHADOW_MMIO bit + // to cache the fact that this entry is in MMIO space. + // + if ( (level == 1) && mmio ) + { + sflags &= ~(_PAGE_PRESENT); + sflags |= _PAGE_SHADOW_MMIO; + } + else + { + // shadow_mode_log_dirty support + // + // Only allow the guest write access to a page a) on a demand fault, + // or b) if the page is already marked as dirty. + // + if ( unlikely((level == 1) && + !(ft & FETCH_TYPE_WRITE) && + shadow_mode_log_dirty(d) && + !sh_mfn_is_dirty(d, target_mfn)) ) + { + sflags &= ~_PAGE_RW; + } + + // protect guest page tables + // + if ( unlikely((level == 1) && + sh_mfn_is_a_page_table(target_mfn)) ) + { + if ( shadow_mode_trap_reads(d) ) + { + // if we are trapping both reads & writes, then mark this page + // as not present... + // + sflags &= ~_PAGE_PRESENT; + } + else + { + // otherwise, just prevent any writes... 
+ // + sflags &= ~_PAGE_RW; + } + } + } + + return sflags; +} + +#undef CHECK + +#if GUEST_PAGING_LEVELS >= 4 +static void +l4e_propagate_from_guest(struct vcpu *v, + guest_l4e_t *gl4e, + mfn_t gl4mfn, + mfn_t sl3mfn, + shadow_l4e_t *sl4p, + fetch_type_t ft) +{ + u32 gflags = guest_l4e_get_flags(*gl4e); + u32 sflags = sh_propagate_flags(v, sl3mfn, gflags, (guest_l1e_t *) gl4e, + gl4mfn, 0, 4, ft); + + *sl4p = shadow_l4e_from_mfn(sl3mfn, sflags); + + SHADOW_DEBUG(PROPAGATE, + "%s gl4e=%" SH_PRI_gpte " sl4e=%" SH_PRI_pte "\n", + fetch_type_names[ft], gl4e->l4, sl4p->l4); + ASSERT(sflags != -1); +} +#endif // GUEST_PAGING_LEVELS >= 4 + +#if GUEST_PAGING_LEVELS >= 3 +static void +l3e_propagate_from_guest(struct vcpu *v, + guest_l3e_t *gl3e, + mfn_t gl3mfn, + mfn_t sl2mfn, + shadow_l3e_t *sl3p, + fetch_type_t ft) +{ + u32 gflags = guest_l3e_get_flags(*gl3e); + u32 sflags = sh_propagate_flags(v, sl2mfn, gflags, (guest_l1e_t *) gl3e, + gl3mfn, 0, 3, ft); + + *sl3p = shadow_l3e_from_mfn(sl2mfn, sflags); + + SHADOW_DEBUG(PROPAGATE, + "%s gl3e=%" SH_PRI_gpte " sl3e=%" SH_PRI_pte "\n", + fetch_type_names[ft], gl3e->l3, sl3p->l3); + ASSERT(sflags != -1); +} +#endif // GUEST_PAGING_LEVELS >= 3 + +static void +l2e_propagate_from_guest(struct vcpu *v, + guest_l2e_t *gl2e, + mfn_t gl2mfn, + mfn_t sl1mfn, + shadow_l2e_t *sl2p, + fetch_type_t ft) +{ + u32 gflags = guest_l2e_get_flags(*gl2e); + u32 sflags = sh_propagate_flags(v, sl1mfn, gflags, (guest_l1e_t *) gl2e, + gl2mfn, 0, 2, ft); + + *sl2p = shadow_l2e_from_mfn(sl1mfn, sflags); + + SHADOW_DEBUG(PROPAGATE, + "%s gl2e=%" SH_PRI_gpte " sl2e=%" SH_PRI_pte "\n", + fetch_type_names[ft], gl2e->l2, sl2p->l2); + ASSERT(sflags != -1); +} + +static inline int +l1e_read_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p, + int mmio) +/* returns 1 if emulation is required, and 0 otherwise */ +{ + struct domain *d = v->domain; + u32 gflags = guest_l1e_get_flags(gw->eff_l1e); + u32 sflags = sh_propagate_flags(v, gmfn, gflags, 
gw->l1e, gw->l1mfn, + mmio, 1, ft_demand_read); + + if ( shadow_mode_trap_reads(d) && !mmio && sh_mfn_is_a_page_table(gmfn) ) + { + // emulation required! + *sl1p = shadow_l1e_empty(); + return 1; + } + + *sl1p = shadow_l1e_from_mfn(gmfn, sflags); + + SHADOW_DEBUG(PROPAGATE, + "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n", + (void *)gw->va, gw->eff_l1e.l1, sl1p->l1); + + ASSERT(sflags != -1); + return 0; +} + +static inline int +l1e_write_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p, + int mmio) +/* returns 1 if emulation is required, and 0 otherwise */ +{ + struct domain *d = v->domain; + u32 gflags = guest_l1e_get_flags(gw->eff_l1e); + u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn, + mmio, 1, ft_demand_write); + + sh_mark_dirty(d, gmfn); + + if ( !mmio && sh_mfn_is_a_page_table(gmfn) ) + { + // emulation required! + *sl1p = shadow_l1e_empty(); + return 1; + } + + *sl1p = shadow_l1e_from_mfn(gmfn, sflags); + + SHADOW_DEBUG(PROPAGATE, + "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n", + (void *)gw->va, gw->eff_l1e.l1, sl1p->l1); + + ASSERT(sflags != -1); + return 0; +} + +static inline void +l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, shadow_l1e_t *sl1p, + int mmio) +{ + gfn_t gfn = guest_l1e_get_gfn(gl1e); + mfn_t gmfn = (mmio) ? _mfn(gfn_x(gfn)) : vcpu_gfn_to_mfn(v, gfn); + u32 gflags = guest_l1e_get_flags(gl1e); + u32 sflags = sh_propagate_flags(v, gmfn, gflags, 0, _mfn(INVALID_MFN), + mmio, 1, ft_prefetch); + + *sl1p = shadow_l1e_from_mfn(gmfn, sflags); + + SHADOW_DEBUG(PROPAGATE, + "gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n", + gl1e.l1, sl1p->l1); + + ASSERT(sflags != -1); +} + + +/**************************************************************************/ +/* These functions update shadow entries (and do bookkeeping on the shadow + * tables they are in). It is intended that they are the only + * functions which ever write (non-zero) data onto a shadow page. 
+ * + * They return a set of flags: + * SHADOW_SET_CHANGED -- we actually wrote a new value to the shadow. + * SHADOW_SET_FLUSH -- the caller must cause a TLB flush. + * SHADOW_SET_ERROR -- the input is not a valid entry (for example, if + * shadow_get_page_from_l1e() fails). + * SHADOW_SET_L3PAE_RECOPY -- one or more vcpu's need to have their local + * copies of their PAE L3 entries re-copied. + */ + +static inline void safe_write_entry(void *dst, void *src) +/* Copy one PTE safely when processors might be running on the + * destination pagetable. This does *not* give safety against + * concurrent writes (that's what the shadow lock is for), just + * stops the hardware picking up partially written entries. */ +{ + volatile unsigned long *d = dst; + unsigned long *s = src; + ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1))); +#if CONFIG_PAGING_LEVELS == 3 + /* In PAE mode, pagetable entries are larger + * than machine words, so won't get written atomically. We need to make + * sure any other cpu running on these shadows doesn't see a + * half-written entry. Do this by marking the entry not-present first, + * then writing the high word before the low word. */ + BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long)); + d[0] = 0; + d[1] = s[1]; + d[0] = s[0]; +#else + /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word, + * which will be an atomic write, since the entry is aligned. */ + BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long)); + *d = *s; +#endif +} + + +static inline void +shadow_write_entries(void *d, void *s, int entries, mfn_t mfn) +/* This function does the actual writes to shadow pages. + * It must not be called directly, since it doesn't do the bookkeeping + * that shadow_set_l*e() functions do. 
*/ +{ + shadow_l1e_t *dst = d; + shadow_l1e_t *src = s; + void *map = NULL; + int i; + + /* Because we mirror access rights at all levels in the shadow, an + * l2 (or higher) entry with the RW bit cleared will leave us with + * no write access through the linear map. + * We detect that by writing to the shadow with copy_to_user() and + * using map_domain_page() to get a writeable mapping if we need to. */ + if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 ) + { + perfc_incrc(shadow_linear_map_failed); + map = sh_map_domain_page(mfn); + ASSERT(map != NULL); + dst = map + ((unsigned long)dst & (PAGE_SIZE - 1)); + } + + + for ( i = 0; i < entries; i++ ) + safe_write_entry(dst++, src++); + + if ( map != NULL ) sh_unmap_domain_page(map); + + /* XXX TODO: + * Update min/max field in page_info struct of this mfn */ +} + +static inline int +perms_strictly_increased(u32 old_flags, u32 new_flags) +/* Given the flags of two entries, are the new flags a strict + * increase in rights over the old ones? */ +{ + u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX); + u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX); + /* Flip the NX bit, since it's the only one that decreases rights; + * we calculate as if it were an "X" bit. */ + of ^= _PAGE_NX_BIT; + nf ^= _PAGE_NX_BIT; + /* If the changed bits are all set in the new flags, then rights strictly + * increased between old and new. 
*/ + return ((of | (of ^ nf)) == nf); +} + +static int inline +shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d) +{ + int res; + mfn_t mfn; + struct domain *owner; + shadow_l1e_t sanitized_sl1e = + shadow_l1e_remove_flags(sl1e, _PAGE_SHADOW_RW | _PAGE_SHADOW_PRESENT); + + //ASSERT(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT); + //ASSERT((shadow_l1e_get_flags(sl1e) & L1_DISALLOW_MASK) == 0); + + if ( !shadow_mode_refcounts(d) ) + return 1; + + res = get_page_from_l1e(sanitized_sl1e, d); + + // If a privileged domain is attempting to install a map of a page it does + // not own, we let it succeed anyway. + // + if ( unlikely(!res) && + IS_PRIV(d) && + !shadow_mode_translate(d) && + valid_mfn(mfn = shadow_l1e_get_mfn(sl1e)) && + (owner = page_get_owner(mfn_to_page(mfn))) && + (d != owner) ) + { + res = get_page_from_l1e(sanitized_sl1e, owner); + SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx " + "which is owned by domain %d: %s\n", + d->domain_id, mfn_x(mfn), owner->domain_id, + res ? 
"success" : "failed"); + } + + if ( unlikely(!res) ) + { + perfc_incrc(shadow_get_page_fail); + SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n"); + } + + return res; +} + +static void inline +shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d) +{ + if ( !shadow_mode_refcounts(d) ) + return; + + put_page_from_l1e(sl1e, d); +} + +#if GUEST_PAGING_LEVELS >= 4 +static int shadow_set_l4e(struct vcpu *v, + shadow_l4e_t *sl4e, + shadow_l4e_t new_sl4e, + mfn_t sl4mfn) +{ + int flags = 0; + shadow_l4e_t old_sl4e; + paddr_t paddr; + ASSERT(sl4e != NULL); + old_sl4e = *sl4e; + + if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */ + + paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) + | (((unsigned long)sl4e) & ~PAGE_MASK)); + + if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT ) + { + /* About to install a new reference */ + sh_get_ref(shadow_l4e_get_mfn(new_sl4e), paddr); + } + + /* Write the new entry */ + shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn); + flags |= SHADOW_SET_CHANGED; + + if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT ) + { + /* We lost a reference to an old mfn. 
*/ + mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e); + if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e))) + || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e), + shadow_l4e_get_flags(new_sl4e)) ) + { + flags |= SHADOW_SET_FLUSH; + } + sh_put_ref(v, osl3mfn, paddr); + } + return flags; +} +#endif /* GUEST_PAGING_LEVELS >= 4 */ + +#if GUEST_PAGING_LEVELS >= 3 +static int shadow_set_l3e(struct vcpu *v, + shadow_l3e_t *sl3e, + shadow_l3e_t new_sl3e, + mfn_t sl3mfn) +{ + int flags = 0; + shadow_l3e_t old_sl3e; + paddr_t paddr; + ASSERT(sl3e != NULL); + old_sl3e = *sl3e; + + if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */ + + paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) + | (((unsigned long)sl3e) & ~PAGE_MASK)); + + if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT ) + { + /* About to install a new reference */ + sh_get_ref(shadow_l3e_get_mfn(new_sl3e), paddr); + } + + /* Write the new entry */ + shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn); + flags |= SHADOW_SET_CHANGED; + +#if GUEST_PAGING_LEVELS == 3 + /* We wrote a guest l3e in a PAE pagetable. This table is copied in + * the linear pagetable entries of its l2s, and may also be copied + * to a low memory location to make it fit in CR3. Report that we + * need to resync those copies (we can't wait for the guest to flush + * the TLB because it might be an increase in rights). */ + { + struct vcpu *vcpu; + + struct pae_l3_bookkeeping *info = sl3p_to_info(sl3e); + for_each_vcpu(v->domain, vcpu) + { + if (info->vcpus & (1 << vcpu->vcpu_id)) + { + // Remember that this flip/update needs to occur. + vcpu->arch.shadow.pae_flip_pending = 1; + flags |= SHADOW_SET_L3PAE_RECOPY; + } + } + } +#endif + + if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT ) + { + /* We lost a reference to an old mfn. 
 */
+        mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
+        if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
+             !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
+                                       shadow_l3e_get_flags(new_sl3e)) )
+        {
+            flags |= SHADOW_SET_FLUSH;
+        }
+        sh_put_ref(v, osl2mfn, paddr);
+    }
+    return flags;
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+static int shadow_set_l2e(struct vcpu *v,
+                          shadow_l2e_t *sl2e,
+                          shadow_l2e_t new_sl2e,
+                          mfn_t sl2mfn)
+{
+    int flags = 0;
+    shadow_l2e_t old_sl2e;
+    paddr_t paddr;
+
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+    /* In 2-on-3 we work with pairs of l2es pointing at two-page
+     * shadows.  Reference counting and up-pointers track from the first
+     * page of the shadow to the first l2e, so make sure that we're
+     * working with those:
+     * Align the pointer down so it's pointing at the first of the pair */
+    sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
+    /* Align the mfn of the shadow entry too */
+    new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
+#endif
+
+    ASSERT(sl2e != NULL);
+    old_sl2e = *sl2e;
+
+    if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
+
+    paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
+             | (((unsigned long)sl2e) & ~PAGE_MASK));
+
+    if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
+    {
+        /* About to install a new reference */
+        sh_get_ref(shadow_l2e_get_mfn(new_sl2e), paddr);
+    }
+
+    /* Write the new entry */
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+    {
+        shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
+        /* The l1 shadow is two pages long and need to be pointed to by
+         * two adjacent l1es.  The pair have the same flags, but point
+         * at odd and even MFNs */
+        ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
+        pair[1].l2 |= (1<<PAGE_SHIFT);
+        shadow_write_entries(sl2e, &pair, 2, sl2mfn);
+    }
+#else /* normal case */
+    shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
+#endif
+    flags |= SHADOW_SET_CHANGED;
+
+    if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
+    {
+        /* We lost a reference to an old mfn. */
+        mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
+        if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e)))
+             || !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
+                                          shadow_l2e_get_flags(new_sl2e)) )
+        {
+            flags |= SHADOW_SET_FLUSH;
+        }
+        sh_put_ref(v, osl1mfn, paddr);
+    }
+    return flags;
+}
+
+static int shadow_set_l1e(struct vcpu *v,
+                          shadow_l1e_t *sl1e,
+                          shadow_l1e_t new_sl1e,
+                          mfn_t sl1mfn)
+{
+    int flags = 0;
+    struct domain *d = v->domain;
+    shadow_l1e_t old_sl1e;
+    ASSERT(sl1e != NULL);
+
+    old_sl1e = *sl1e;
+
+    if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
+
+    if ( shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT )
+    {
+        /* About to install a new reference */
+        if ( shadow_mode_refcounts(d) ) {
+            if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
+            {
+                /* Doesn't look like a pagetable. */
+                flags |= SHADOW_SET_ERROR;
+                new_sl1e = shadow_l1e_empty();
+            }
+        }
+    }
+
+    /* Write the new entry */
+    shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
+    flags |= SHADOW_SET_CHANGED;
+
+    if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
+    {
+        /* We lost a reference to an old mfn. */
+        /* N.B.
Unlike higher-level sets, never need an extra flush + * when writing an l1e. Because it points to the same guest frame + * as the guest l1e did, it's the guest's responsibility to + * trigger a flush later. */ + if ( shadow_mode_refcounts(d) ) + { + shadow_put_page_from_l1e(old_sl1e, d); + } + } + return flags; +} + + +/**************************************************************************/ +/* These functions take a vcpu and a virtual address, and return a pointer + * to the appropriate level N entry from the shadow tables. + * If the necessary tables are not present in the shadow, they return NULL. */ + +/* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has + * more levels than the guest, the upper levels are always fixed and do not + * reflect any information from the guest, so we do not use these functions + * to access them. */ + +#if GUEST_PAGING_LEVELS >= 4 +static shadow_l4e_t * +shadow_get_l4e(struct vcpu *v, unsigned long va) +{ + /* Reading the top level table is always valid. */ + return sh_linear_l4_table(v) + shadow_l4_linear_offset(va); +} +#endif /* GUEST_PAGING_LEVELS >= 4 */ + + +#if GUEST_PAGING_LEVELS >= 3 +static shadow_l3e_t * +shadow_get_l3e(struct vcpu *v, unsigned long va) +{ +#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */ + /* Get the l4 */ + shadow_l4e_t *sl4e = shadow_get_l4e(v, va); + ASSERT(sl4e != NULL); + if ( !(shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT) ) + return NULL; + ASSERT(valid_mfn(shadow_l4e_get_mfn(*sl4e))); + /* l4 was present; OK to get the l3 */ + return sh_linear_l3_table(v) + shadow_l3_linear_offset(va); +#else /* PAE... */ + /* Top level is always mapped */ + ASSERT(v->arch.shadow_vtable); + return ((shadow_l3e_t *)v->arch.shadow_vtable) + shadow_l3_linear_offset(va); +#endif +} +#endif /* GUEST_PAGING_LEVELS >= 3 */ + + +static shadow_l2e_t * +shadow_get_l2e(struct vcpu *v, unsigned long va) +{ +#if GUEST_PAGING_LEVELS >= 3 /* 64bit/PAE... 
*/ + /* Get the l3 */ + shadow_l3e_t *sl3e = shadow_get_l3e(v, va); + if ( sl3e == NULL || !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) + return NULL; + ASSERT(valid_mfn(shadow_l3e_get_mfn(*sl3e))); + /* l3 was present; OK to get the l2 */ +#endif + return sh_linear_l2_table(v) + shadow_l2_linear_offset(va); +} + + +#if 0 // avoid the compiler warning for now... + +static shadow_l1e_t * +shadow_get_l1e(struct vcpu *v, unsigned long va) +{ + /* Get the l2 */ + shadow_l2e_t *sl2e = shadow_get_l2e(v, va); + if ( sl2e == NULL || !(shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT) ) + return NULL; + ASSERT(valid_mfn(shadow_l2e_get_mfn(*sl2e))); + /* l2 was present; OK to get the l1 */ + return sh_linear_l1_table(v) + shadow_l1_linear_offset(va); +} + +#endif + + +/**************************************************************************/ +/* Macros to walk pagetables. These take the shadow of a pagetable and + * walk every "interesting" entry. That is, they don't touch Xen mappings, + * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every + * second entry (since pairs of entries are managed together). For multi-page + * shadows they walk all pages. + * + * Arguments are an MFN, the variable to point to each entry, a variable + * to indicate that we are done (we will shortcut to the end of the scan + * when _done != 0), a variable to indicate that we should avoid Xen mappings, + * and the code. + * + * WARNING: These macros have side-effects. They change the values of both + * the pointer and the MFN. 
*/ + +static inline void increment_ptr_to_guest_entry(void *ptr) +{ + if ( ptr ) + { + guest_l1e_t **entry = ptr; + (*entry)++; + } +} + +/* All kinds of l1: touch all entries */ +#define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ +do { \ + int _i; \ + shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \ + ASSERT((mfn_to_page(_sl1mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l1_shadow \ + || (mfn_to_page(_sl1mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_fl1_shadow); \ + for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \ + { \ + (_sl1e) = _sp + _i; \ + if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl1p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +/* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */ +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2 +#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ +do { \ + int __done = 0; \ + _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \ + ({ (__done = _done); }), _code); \ + _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \ + if ( !__done ) \ + _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \ + ({ (__done = _done); }), _code); \ +} while (0) +#else /* Everything else; l1 shadows are only one page */ +#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ + _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) +#endif + + +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2 + +/* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */ +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ +do { \ + int _i, _j, __done = 0; \ + ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l2_32_shadow); \ + for ( _j = 0; _j < 4 && !__done; _j++ ) \ + { \ + shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \ + for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \ + if ( (!(_xen)) \ + || ((_j * 
SHADOW_L2_PAGETABLE_ENTRIES) + _i) \ + < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \ + { \ + (_sl2e) = _sp + _i; \ + if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( (__done = (_done)) ) break; \ + increment_ptr_to_guest_entry(_gl2p); \ + } \ + unmap_shadow_page(_sp); \ + _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \ + } \ +} while (0) + +#elif GUEST_PAGING_LEVELS == 2 + +/* 32-bit on 32-bit: avoid Xen entries */ +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ +do { \ + int _i; \ + shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \ + ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l2_32_shadow); \ + for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ + if ( (!(_xen)) \ + || \ + (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \ + { \ + (_sl2e) = _sp + _i; \ + if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl2p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +#elif GUEST_PAGING_LEVELS == 3 + +/* PAE: if it's an l2h, don't touch Xen mappings */ +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ +do { \ + int _i; \ + shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \ + ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l2_pae_shadow \ + || (mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l2h_pae_shadow); \ + for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ + if ( (!(_xen)) \ + || ((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \ + != PGC_SH_l2h_pae_shadow) \ + || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \ + < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \ + { \ + (_sl2e) = _sp + _i; \ + if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl2p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +#else + +/* 64-bit l2: touch all 
entries */ +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ +do { \ + int _i; \ + shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \ + ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l2_64_shadow); \ + for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ + { \ + (_sl2e) = _sp + _i; \ + if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl2p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +#endif /* different kinds of l2 */ + +#if GUEST_PAGING_LEVELS == 3 + +/* PAE l3 subshadow: touch all entries (FOREACH_L2E will find Xen l2es). */ +#define SHADOW_FOREACH_L3E_SUB(_sl3e, _gl3p, _done, _code) \ +do { \ + int _i; \ + for ( _i = 0; _i < 4; _i++ ) \ + { \ + if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + _sl3e++; \ + increment_ptr_to_guest_entry(_gl3p); \ + } \ +} while (0) + +/* PAE l3 full shadow: call subshadow walk on all valid l3 subshadows */ +#define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \ +do { \ + int _i, _j, _k, __done = 0; \ + ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l3_pae_shadow); \ + /* The subshadows are split, 64 on each page of the shadow */ \ + for ( _j = 0; _j < 2 && !__done; _j++ ) \ + { \ + void *_sp = sh_map_domain_page(_sl3mfn); \ + for ( _i = 0; _i < 64; _i++ ) \ + { \ + /* Every second 32-byte region is a bookkeeping entry */ \ + _sl3e = (shadow_l3e_t *)(_sp + (64 * _i)); \ + if ( (sl3p_to_info(_sl3e))->refcount > 0 ) \ + SHADOW_FOREACH_L3E_SUB(_sl3e, _gl3p, \ + ({ __done = (_done); __done; }), \ + _code); \ + else \ + for ( _k = 0 ; _k < 4 ; _k++ ) \ + increment_ptr_to_guest_entry(_gl3p); \ + if ( __done ) break; \ + } \ + sh_unmap_domain_page(_sp); \ + _sl3mfn = _mfn(mfn_x(_sl3mfn) + 1); \ + } \ +} while (0) + +#elif GUEST_PAGING_LEVELS == 4 + +/* 64-bit l3: touch all entries */ +#define SHADOW_FOREACH_L3E(_sl3mfn, 
_sl3e, _gl3p, _done, _code) \ +do { \ + int _i; \ + shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \ + ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l3_64_shadow); \ + for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \ + { \ + (_sl3e) = _sp + _i; \ + if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl3p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +/* 64-bit l4: avoid Xen mappings */ +#define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code) \ +do { \ + int _i; \ + shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \ + ASSERT((mfn_to_page(_sl4mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l4_64_shadow); \ + for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \ + { \ + if ( (!(_xen)) || is_guest_l4_slot(_i) ) \ + { \ + (_sl4e) = _sp + _i; \ + if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + } \ + increment_ptr_to_guest_entry(_gl4p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +#endif + + + +/**************************************************************************/ +/* Functions to install Xen mappings and linear mappings in shadow pages */ + +static mfn_t sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type); + +// XXX -- this function should probably be moved to shadow-common.c, but that +// probably wants to wait until the shadow types have been moved from +// shadow-types.h to shadow-private.h +// +#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4 +void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn) +{ + struct domain *d = v->domain; + shadow_l4e_t *sl4e; + + sl4e = sh_map_domain_page(sl4mfn); + ASSERT(sl4e != NULL); + ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t)); + + /* Copy the common Xen mappings from the idle domain */ + memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT], + &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT], + 
ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t)); + + /* Install the per-domain mappings for this domain */ + sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] = + shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)), + __PAGE_HYPERVISOR); + + /* Linear mapping */ + sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] = + shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR); + sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] = + shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR); + + if ( shadow_mode_translate(v->domain) ) + { + /* install domain-specific P2M table */ + sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] = + shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table), + __PAGE_HYPERVISOR); + } + + sh_unmap_domain_page(sl4e); +} +#endif + +#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3 +// For 3-on-3 PV guests, we need to make sure the xen mappings are in +// place, which means that we need to populate the l2h entry in the l3 +// table. + +void sh_install_xen_entries_in_l2h(struct vcpu *v, + mfn_t sl2hmfn) +{ + struct domain *d = v->domain; + shadow_l2e_t *sl2e; + int i; + + sl2e = sh_map_domain_page(sl2hmfn); + ASSERT(sl2e != NULL); + ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t)); + + /* Copy the common Xen mappings from the idle domain */ + memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)], + &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT], + L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t)); + + /* Install the per-domain mappings for this domain */ + for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) + sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] = + shadow_l2e_from_mfn( + page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i), + __PAGE_HYPERVISOR); + + /* We don't set up a linear mapping here because we can't until this + * l2h is installed in an l3e. sh_update_linear_entries() handles + * the linear mappings when the l3 is loaded. 
*/ + + if ( shadow_mode_translate(d) ) + { + /* Install the domain-specific p2m table */ + l3_pgentry_t *p2m; + ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0); + p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); + for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ ) + { + sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] = + shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])), + __PAGE_HYPERVISOR); + } + sh_unmap_domain_page(p2m); + } + + sh_unmap_domain_page(sl2e); +} + +void sh_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn) +{ + shadow_l3e_t *sl3e; + guest_l3e_t *gl3e = v->arch.guest_vtable; + shadow_l3e_t new_sl3e; + gfn_t l2gfn; + mfn_t l2gmfn, l2smfn; + int r; + + ASSERT(!shadow_mode_external(v->domain)); + ASSERT(guest_l3e_get_flags(gl3e[3]) & _PAGE_PRESENT); + l2gfn = guest_l3e_get_gfn(gl3e[3]); + l2gmfn = sh_gfn_to_mfn(v->domain, gfn_x(l2gfn)); + l2smfn = get_shadow_status(v, l2gmfn, PGC_SH_l2h_shadow); + if ( !valid_mfn(l2smfn) ) + { + l2smfn = sh_make_shadow(v, l2gmfn, PGC_SH_l2h_shadow); + } + l3e_propagate_from_guest(v, &gl3e[3], gl3mfn, l2smfn, &new_sl3e, + ft_prefetch); + sl3e = sh_map_domain_page(sl3mfn); + r = shadow_set_l3e(v, &sl3e[3], new_sl3e, sl3mfn); + sh_unmap_domain_page(sl3e); +} +#endif + + +#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2 +void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn) +{ + struct domain *d = v->domain; + shadow_l2e_t *sl2e; + int i; + + sl2e = sh_map_domain_page(sl2mfn); + ASSERT(sl2e != NULL); + ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t)); + + /* Copy the common Xen mappings from the idle domain */ + memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT], + &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT], + L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t)); + + /* Install the per-domain mappings for this domain */ + for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) + sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] = + shadow_l2e_from_mfn( + 
page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i), + __PAGE_HYPERVISOR); + + /* Linear mapping */ + sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] = + shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR); + sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] = + shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR); + + if ( shadow_mode_translate(d) ) + { + /* install domain-specific P2M table */ + sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] = + shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table), + __PAGE_HYPERVISOR); + } + + sh_unmap_domain_page(sl2e); +} +#endif + + + + + +/**************************************************************************/ +/* Create a shadow of a given guest page. + */ +static mfn_t +sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type) +{ + mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn)); + SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n", + mfn_x(gmfn), shadow_type, mfn_x(smfn)); + + if ( shadow_type != PGC_SH_guest_root_type ) + /* Lower-level shadow, not yet linked form a higher level */ + mfn_to_page(smfn)->up = 0; + + // Create the Xen mappings... 
+ if ( !shadow_mode_external(v->domain) ) + { + switch (shadow_type) + { +#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4 + case PGC_SH_l4_shadow: + sh_install_xen_entries_in_l4(v, gmfn, smfn); break; +#endif +#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3 + case PGC_SH_l3_shadow: + sh_install_xen_entries_in_l3(v, gmfn, smfn); break; + case PGC_SH_l2h_shadow: + sh_install_xen_entries_in_l2h(v, smfn); break; +#endif +#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2 + case PGC_SH_l2_shadow: + sh_install_xen_entries_in_l2(v, gmfn, smfn); break; +#endif + default: /* Do nothing */ break; + } + } + + shadow_promote(v, gmfn, shadow_type); + set_shadow_status(v, gmfn, shadow_type, smfn); + + return smfn; +} + +/* Make a splintered superpage shadow */ +static mfn_t +make_fl1_shadow(struct vcpu *v, gfn_t gfn) +{ + mfn_t smfn = shadow_alloc(v->domain, PGC_SH_fl1_shadow, + (unsigned long) gfn_x(gfn)); + + SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" SH_PRI_mfn "\n", + gfn_x(gfn), mfn_x(smfn)); + + set_fl1_shadow_status(v, gfn, smfn); + return smfn; +} + + +#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS +mfn_t +sh_make_monitor_table(struct vcpu *v) +{ + + ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0); + +#if CONFIG_PAGING_LEVELS == 4 + { + struct domain *d = v->domain; + mfn_t m4mfn; + m4mfn = shadow_alloc(d, PGC_SH_monitor_table, 0); + sh_install_xen_entries_in_l4(v, m4mfn, m4mfn); + /* Remember the level of this table */ + mfn_to_page(m4mfn)->shadow_flags = 4; +#if SHADOW_PAGING_LEVELS < 4 + // Install a monitor l3 table in slot 0 of the l4 table. + // This is used for shadow linear maps. 
+ { + mfn_t m3mfn; + l4_pgentry_t *l4e; + m3mfn = shadow_alloc(d, PGC_SH_monitor_table, 0); + mfn_to_page(m3mfn)->shadow_flags = 3; + l4e = sh_map_domain_page(m4mfn); + l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR); + sh_unmap_domain_page(l4e); + } +#endif /* SHADOW_PAGING_LEVELS < 4 */ + return m4mfn; + } + +#elif CONFIG_PAGING_LEVELS == 3 + + { + struct domain *d = v->domain; + mfn_t m3mfn, m2mfn; + l3_pgentry_t *l3e; + l2_pgentry_t *l2e; + int i; + + m3mfn = shadow_alloc(d, PGC_SH_monitor_table, 0); + /* Remember the level of this table */ + mfn_to_page(m3mfn)->shadow_flags = 3; + + // Install a monitor l2 table in slot 3 of the l3 table. + // This is used for all Xen entries, including linear maps + m2mfn = shadow_alloc(d, PGC_SH_monitor_table, 0); + mfn_to_page(m2mfn)->shadow_flags = 2; + l3e = sh_map_domain_page(m3mfn); + l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT); + sh_install_xen_entries_in_l2h(v, m2mfn); + /* Install the monitor's own linear map */ + l2e = sh_map_domain_page(m2mfn); + for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) + l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] = + (l3e_get_flags(l3e[i]) & _PAGE_PRESENT) + ? 
l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR) + : l2e_empty(); + sh_unmap_domain_page(l2e); + sh_unmap_domain_page(l3e); + + SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn)); + return m3mfn; + } + +#elif CONFIG_PAGING_LEVELS == 2 + + { + struct domain *d = v->domain; + mfn_t m2mfn; + m2mfn = shadow_alloc(d, PGC_SH_monitor_table, 0); + sh_install_xen_entries_in_l2(v, m2mfn, m2mfn); + /* Remember the level of this table */ + mfn_to_page(m2mfn)->shadow_flags = 2; + return m2mfn; + } + +#else +#error this should not happen +#endif /* CONFIG_PAGING_LEVELS */ +} +#endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */ + +/**************************************************************************/ +/* These functions also take a virtual address and return the level-N + * shadow table mfn and entry, but they create the shadow pagetables if + * they are needed. The "demand" argument is non-zero when handling + * a demand fault (so we know what to do about accessed bits &c). + * If the necessary tables are not present in the guest, they return NULL. */ +#if GUEST_PAGING_LEVELS >= 4 +static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v, + walk_t *gw, + mfn_t *sl4mfn) +{ + /* There is always a shadow of the top level table. Get it. */ + *sl4mfn = pagetable_get_mfn(v->arch.shadow_table); + /* Reading the top level table is always valid. */ + return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va); +} +#endif /* GUEST_PAGING_LEVELS >= 4 */ + + +#if GUEST_PAGING_LEVELS >= 3 +static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v, + walk_t *gw, + mfn_t *sl3mfn, + fetch_type_t ft) +{ +#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */ + mfn_t sl4mfn; + shadow_l4e_t *sl4e; + if ( !valid_mfn(gw->l3mfn) ) return NULL; /* No guest page. 
*/ + /* Get the l4e */ + sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn); + ASSERT(sl4e != NULL); + if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) + { + *sl3mfn = shadow_l4e_get_mfn(*sl4e); + ASSERT(valid_mfn(*sl3mfn)); + } + else + { + int r; + shadow_l4e_t new_sl4e; + /* No l3 shadow installed: find and install it. */ + *sl3mfn = get_shadow_status(v, gw->l3mfn, PGC_SH_l3_shadow); + if ( !valid_mfn(*sl3mfn) ) + { + /* No l3 shadow of this page exists at all: make one. */ + *sl3mfn = sh_make_shadow(v, gw->l3mfn, PGC_SH_l3_shadow); + } + /* Install the new sl3 table in the sl4e */ + l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn, + *sl3mfn, &new_sl4e, ft); + r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn); + ASSERT((r & SHADOW_SET_FLUSH) == 0); + } + /* Now follow it down a level. Guaranteed to succeed. */ + return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va); +#else /* PAE... */ + /* There is always a shadow of the top level table. Get it. */ + *sl3mfn = pagetable_get_mfn(v->arch.shadow_table); + /* This next line is important: the shadow l3 table is in an 8k + * shadow and we need to return the right mfn of the pair. This call + * will set it for us as a side-effect. */ + (void) shadow_l3_index(sl3mfn, guest_index(gw->l3e)); + ASSERT(v->arch.shadow_vtable); + return ((shadow_l3e_t *)v->arch.shadow_vtable) + + shadow_l3_table_offset(gw->va); +#endif /* GUEST_PAGING_LEVELS >= 4 */ +} +#endif /* GUEST_PAGING_LEVELS >= 3 */ + + +static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v, + walk_t *gw, + mfn_t *sl2mfn, + fetch_type_t ft) +{ +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64bit... */ + mfn_t sl3mfn = _mfn(INVALID_MFN); + shadow_l3e_t *sl3e; + if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. 
*/ + /* Get the l3e */ + sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft); + ASSERT(sl3e != NULL); /* Since we know guest PT is valid this far */ + if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) + { + *sl2mfn = shadow_l3e_get_mfn(*sl3e); + ASSERT(valid_mfn(*sl2mfn)); + } + else + { + int r; + shadow_l3e_t new_sl3e; + /* No l2 shadow installed: find and install it. */ + *sl2mfn = get_shadow_status(v, gw->l2mfn, PGC_SH_l2_shadow); + if ( !valid_mfn(*sl2mfn) ) + { + /* No l2 shadow of this page exists at all: make one. */ + *sl2mfn = sh_make_shadow(v, gw->l2mfn, PGC_SH_l2_shadow); + } + /* Install the new sl2 table in the sl3e */ + l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn, + *sl2mfn, &new_sl3e, ft); + r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn); + ASSERT((r & SHADOW_SET_FLUSH) == 0); +#if GUEST_PAGING_LEVELS == 3 + /* Need to sync up the linear maps, as we are about to use them */ + ASSERT( r & SHADOW_SET_L3PAE_RECOPY ); + sh_pae_recopy(v->domain); +#endif + } + /* Now follow it down a level. Guaranteed to succeed. */ + return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); +#else /* 32bit... */ + /* There is always a shadow of the top level table. Get it. */ + *sl2mfn = pagetable_get_mfn(v->arch.shadow_table); + /* This next line is important: the guest l2 has a 16k + * shadow, we need to return the right mfn of the four. This + * call will set it for us as a side-effect. */ + (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e)); + /* Reading the top level table is always valid. 
*/ + return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); +#endif +} + + +static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v, + walk_t *gw, + mfn_t *sl1mfn, + fetch_type_t ft) +{ + mfn_t sl2mfn; + shadow_l2e_t *sl2e; + + /* Get the l2e */ + sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft); + if ( sl2e == NULL ) return NULL; + if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) + { + *sl1mfn = shadow_l2e_get_mfn(*sl2e); + ASSERT(valid_mfn(*sl1mfn)); + } + else + { + shadow_l2e_t new_sl2e; + int r, flags = guest_l2e_get_flags(*gw->l2e); + /* No l1 shadow installed: find and install it. */ + if ( !(flags & _PAGE_PRESENT) ) + return NULL; /* No guest page. */ + if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) ) + { + /* Splintering a superpage */ + gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e); + *sl1mfn = get_fl1_shadow_status(v, l2gfn); + if ( !valid_mfn(*sl1mfn) ) + { + /* No fl1 shadow of this superpage exists at all: make one. */ + *sl1mfn = make_fl1_shadow(v, l2gfn); + } + } + else + { + /* Shadowing an actual guest l1 table */ + if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */ + *sl1mfn = get_shadow_status(v, gw->l1mfn, PGC_SH_l1_shadow); + if ( !valid_mfn(*sl1mfn) ) + { + /* No l1 shadow of this page exists at all: make one. */ + *sl1mfn = sh_make_shadow(v, gw->l1mfn, PGC_SH_l1_shadow); + } + } + /* Install the new sl1 table in the sl2e */ + l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn, + *sl1mfn, &new_sl2e, ft); + r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn); + ASSERT((r & SHADOW_SET_FLUSH) == 0); + /* This next line is important: in 32-on-PAE and 32-on-64 modes, + * the guest l1 table has an 8k shadow, and we need to return + * the right mfn of the pair. This call will set it for us as a + * side-effect. (In all other cases, it's a no-op and will be + * compiled out.) */ + (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va)); + } + /* Now follow it down a level. Guaranteed to succeed. 
*/ + return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va); +} + + + +/**************************************************************************/ +/* Destructors for shadow tables: + * Unregister the shadow, decrement refcounts of any entries present in it, + * and release the memory. + * + * N.B. These destructors do not clear the contents of the shadows. + * This allows us to delay TLB shootdowns until the page is being reused. + * See shadow_alloc() and shadow_free() for how this is handled. + */ + +#if GUEST_PAGING_LEVELS >= 4 +void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn) +{ + shadow_l4e_t *sl4e; + u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask; + mfn_t gmfn, sl4mfn; + int xen_mappings; + + SHADOW_DEBUG(DESTROY_SHADOW, + "%s(%05lx)\n", __func__, mfn_x(smfn)); + ASSERT(t == PGC_SH_l4_shadow); + + /* Record that the guest page isn't shadowed any more (in this type) */ + gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_shadow_status(v, gmfn, t, smfn); + shadow_demote(v, gmfn, t); + /* Take this shadow off the list of root shadows */ + list_del_init(&mfn_to_page(smfn)->list); + + /* Decrement refcounts of all the old entries */ + xen_mappings = (!shadow_mode_external(v->domain)); + sl4mfn = smfn; + SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, { + if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) + { + sh_put_ref(v, shadow_l4e_get_mfn(*sl4e), + (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) + | ((unsigned long)sl4e & ~PAGE_MASK)); + } + }); + + /* Put the memory back in the pool */ + shadow_free(v->domain, smfn); +} +#endif + +#if GUEST_PAGING_LEVELS >= 3 +void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn) +{ + shadow_l3e_t *sl3e; + u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask; + mfn_t gmfn, sl3mfn; + + SHADOW_DEBUG(DESTROY_SHADOW, + "%s(%05lx)\n", __func__, mfn_x(smfn)); + ASSERT(t == PGC_SH_l3_shadow); + + /* Record that the guest page isn't shadowed any more (in this type) */ + gmfn = 
_mfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_shadow_status(v, gmfn, t, smfn); + shadow_demote(v, gmfn, t); +#if GUEST_PAGING_LEVELS == 3 + /* Take this shadow off the list of root shadows */ + list_del_init(&mfn_to_page(smfn)->list); +#endif + + /* Decrement refcounts of all the old entries */ + sl3mfn = smfn; + SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, { + if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) + sh_put_ref(v, shadow_l3e_get_mfn(*sl3e), + (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) + | ((unsigned long)sl3e & ~PAGE_MASK)); + }); + + /* Put the memory back in the pool */ + shadow_free(v->domain, smfn); +} +#endif + + +#if GUEST_PAGING_LEVELS == 3 +static void sh_destroy_l3_subshadow(struct vcpu *v, + shadow_l3e_t *sl3e) +/* Tear down just a single 4-entry l3 on a 2-page l3 shadow. */ +{ + int i; + ASSERT((unsigned long)sl3e % (4 * sizeof (shadow_l3e_t)) == 0); + for ( i = 0; i < GUEST_L3_PAGETABLE_ENTRIES; i++ ) + if ( shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT ) + sh_put_ref(v, shadow_l3e_get_mfn(sl3e[i]), + maddr_from_mapped_domain_page(sl3e)); +} +#endif + +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) +void sh_unpin_all_l3_subshadows(struct vcpu *v, mfn_t smfn) +/* Walk a full PAE l3 shadow, unpinning all of the subshadows on it */ +{ + int i, j; + struct pae_l3_bookkeeping *bk; + + ASSERT((mfn_to_page(smfn)->count_info & PGC_SH_type_mask) + == PGC_SH_l3_pae_shadow); + /* The subshadows are split, 64 on each page of the shadow */ + for ( i = 0; i < 2; i++ ) + { + void *p = sh_map_domain_page(_mfn(mfn_x(smfn) + i)); + for ( j = 0; j < 64; j++ ) + { + /* Every second 32-byte region is a bookkeeping entry */ + bk = (struct pae_l3_bookkeeping *)(p + (64 * j) + 32); + if ( bk->pinned ) + sh_unpin_l3_subshadow(v, (shadow_l3e_t *)(p + (64*j)), smfn); + /* Check whether we've just freed the whole shadow */ + if ( (mfn_to_page(smfn)->count_info & PGC_SH_count_mask) == 0 ) + { + sh_unmap_domain_page(p); + return; + } + } + 
sh_unmap_domain_page(p); + } +} +#endif + +void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn) +{ + shadow_l2e_t *sl2e; + u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask; + mfn_t gmfn, sl2mfn; + int xen_mappings; + + SHADOW_DEBUG(DESTROY_SHADOW, + "%s(%05lx)\n", __func__, mfn_x(smfn)); + ASSERT(t == PGC_SH_l2_shadow + || t == PGC_SH_l2h_pae_shadow); + + /* Record that the guest page isn't shadowed any more (in this type) */ + gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_shadow_status(v, gmfn, t, smfn); + shadow_demote(v, gmfn, t); +#if GUEST_PAGING_LEVELS == 2 + /* Take this shadow off the list of root shadows */ + list_del_init(&mfn_to_page(smfn)->list); +#endif + + /* Decrement refcounts of all the old entries */ + sl2mfn = smfn; + xen_mappings = (!shadow_mode_external(v->domain) && + ((GUEST_PAGING_LEVELS == 2) || + ((GUEST_PAGING_LEVELS == 3) && + (t == PGC_SH_l2h_pae_shadow)))); + SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, { + if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) + sh_put_ref(v, shadow_l2e_get_mfn(*sl2e), + (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT) + | ((unsigned long)sl2e & ~PAGE_MASK)); + }); + + /* Put the memory back in the pool */ + shadow_free(v->domain, smfn); +} + +void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn) +{ + struct domain *d = v->domain; + shadow_l1e_t *sl1e; + u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask; + + SHADOW_DEBUG(DESTROY_SHADOW, + "%s(%05lx)\n", __func__, mfn_x(smfn)); + ASSERT(t == PGC_SH_l1_shadow || t == PGC_SH_fl1_shadow); + + /* Record that the guest page isn't shadowed any more (in this type) */ + if ( t == PGC_SH_fl1_shadow ) + { + gfn_t gfn = _gfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_fl1_shadow_status(v, gfn, smfn); + } + else + { + mfn_t gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_shadow_status(v, gmfn, t, smfn); + shadow_demote(v, gmfn, t); + } + + if ( shadow_mode_refcounts(d) ) + { + /* Decrement refcounts of all the old entries 
*/ + mfn_t sl1mfn = smfn; + SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, { + if ( shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT ) + shadow_put_page_from_l1e(*sl1e, d); + }); + } + + /* Put the memory back in the pool */ + shadow_free(v->domain, smfn); +} + +#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS +void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn) +{ + struct domain *d = v->domain; + ASSERT((mfn_to_page(mmfn)->count_info & PGC_SH_type_mask) + == PGC_SH_monitor_table); + +#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4) + /* Need to destroy the l3 monitor page in slot 0 too */ + { + l4_pgentry_t *l4e = sh_map_domain_page(mmfn); + ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT); + shadow_free(d, _mfn(l4e_get_pfn(l4e[0]))); + sh_unmap_domain_page(l4e); + } +#elif CONFIG_PAGING_LEVELS == 3 + /* Need to destroy the l2 monitor page in slot 4 too */ + { + l3_pgentry_t *l3e = sh_map_domain_page(mmfn); + ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT); + shadow_free(d, _mfn(l3e_get_pfn(l3e[3]))); + sh_unmap_domain_page(l3e); + } +#endif + + /* Put the memory back in the pool */ + shadow_free(d, mmfn); +} +#endif + +/**************************************************************************/ +/* Functions to destroy non-Xen mappings in a pagetable hierarchy. + * These are called from common code when we are running out of shadow + * memory, and unpinning all the top-level shadows hasn't worked. + * + * This implementation is pretty crude and slow, but we hope that it won't + * be called very often. 
*/ + +#if GUEST_PAGING_LEVELS == 2 + +void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn) +{ + shadow_l2e_t *sl2e; + int xen_mappings = !shadow_mode_external(v->domain); + SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, { + (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn); + }); +} + +#elif GUEST_PAGING_LEVELS == 3 + +void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl3mfn) +/* Walk a full PAE l3 shadow, unhooking entries from all the subshadows */ +{ + shadow_l3e_t *sl3e; + SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, { + if ( (shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) { + mfn_t sl2mfn = shadow_l3e_get_mfn(*sl3e); + if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH_type_mask) + == PGC_SH_l2h_pae_shadow ) + { + /* High l2: need to pick particular l2es to unhook */ + shadow_l2e_t *sl2e; + SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, 1, { + (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn); + }); + } + else + { + /* Normal l2: can safely unhook the whole l3e */ + (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn); + } + } + }); + /* We've changed PAE L3 entries: must sync up various copies of them */ + sh_pae_recopy(v->domain); +} + +#elif GUEST_PAGING_LEVELS == 4 + +void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn) +{ + shadow_l4e_t *sl4e; + int xen_mappings = !shadow_mode_external(v->domain); + SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, { + (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn); + }); +} + +#endif + +/**************************************************************************/ +/* Internal translation functions. + * These functions require a pointer to the shadow entry that will be updated. + */ + +/* These functions take a new guest entry, translate it to shadow and write + * the shadow entry. + * + * They return the same bitmaps as the shadow_set_lXe() functions. 
+ */ + +#if GUEST_PAGING_LEVELS >= 4 +static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se) +{ + shadow_l4e_t new_sl4e; + guest_l4e_t *new_gl4e = new_ge; + shadow_l4e_t *sl4p = se; + mfn_t sl3mfn = _mfn(INVALID_MFN); + int result = 0; + + perfc_incrc(shadow_validate_gl4e_calls); + + if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT ) + { + gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e); + mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn); + if ( valid_mfn(gl3mfn) ) + sl3mfn = get_shadow_status(v, gl3mfn, PGC_SH_l3_shadow); + else + result |= SHADOW_SET_ERROR; + } + l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN), + sl3mfn, &new_sl4e, ft_prefetch); + result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn); + return result; +} +#endif // GUEST_PAGING_LEVELS >= 4 + +#if GUEST_PAGING_LEVELS >= 3 +static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se) +{ + shadow_l3e_t new_sl3e; + guest_l3e_t *new_gl3e = new_ge; + shadow_l3e_t *sl3p = se; + mfn_t sl2mfn = _mfn(INVALID_MFN); + int result = 0; + + perfc_incrc(shadow_validate_gl3e_calls); + + if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT ) + { + gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e); + mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn); + if ( valid_mfn(gl2mfn) ) + sl2mfn = get_shadow_status(v, gl2mfn, PGC_SH_l2_shadow); + else + result |= SHADOW_SET_ERROR; + } + l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN), + sl2mfn, &new_sl3e, ft_prefetch); + result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn); + +#if GUEST_PAGING_LEVELS == 3 + /* We have changed a PAE l3 entry: need to sync up the possible copies + * of it */ + if ( result & SHADOW_SET_L3PAE_RECOPY ) + sh_pae_recopy(v->domain); +#endif + + return result; +} +#endif // GUEST_PAGING_LEVELS >= 3 + +static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se) +{ + shadow_l2e_t new_sl2e; + guest_l2e_t *new_gl2e = new_ge; + shadow_l2e_t *sl2p = se; + mfn_t sl1mfn = _mfn(INVALID_MFN); + int result = 0; 
+ + perfc_incrc(shadow_validate_gl2e_calls); + + if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT ) + { + gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e); + if ( guest_supports_superpages(v) && + (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) ) + { + // superpage -- need to look up the shadow L1 which holds the + // splitters... + sl1mfn = get_fl1_shadow_status(v, gl1gfn); +#if 0 + // XXX - it's possible that we want to do some kind of prefetch + // for superpage fl1's here, but this is *not* on the demand path, + // so we'll hold off trying that for now... + // + if ( !valid_mfn(sl1mfn) ) + sl1mfn = make_fl1_shadow(v, gl1gfn); +#endif + } + else + { + mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn); + if ( valid_mfn(gl1mfn) ) + sl1mfn = get_shadow_status(v, gl1mfn, PGC_SH_l1_shadow); + else + result |= SHADOW_SET_ERROR; + } + } + l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN), + sl1mfn, &new_sl2e, ft_prefetch); + result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn); + + return result; +} + +static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se) +{ + shadow_l1e_t new_sl1e; + guest_l1e_t *new_gl1e = new_ge; + shadow_l1e_t *sl1p = se; + gfn_t gfn; + mfn_t mfn; + int result = 0; + + perfc_incrc(shadow_validate_gl1e_calls); + + gfn = guest_l1e_get_gfn(*new_gl1e); + mfn = vcpu_gfn_to_mfn(v, gfn); + + l1e_propagate_from_guest(v, *new_gl1e, &new_sl1e, + /* mmio? */ !valid_mfn(mfn)); + + result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn); + return result; +} + + +/**************************************************************************/ +/* Functions which translate and install a the shadows of arbitrary guest + * entries that we have just seen the guest write. */ + + +static inline int +sh_map_and_validate(struct vcpu *v, mfn_t gmfn, + void *new_gp, u32 size, u32 sh_type, + u32 (*shadow_index)(mfn_t *smfn, u32 idx), + int (*validate_ge)(struct vcpu *v, void *ge, + mfn_t smfn, void *se)) +/* Generic function for mapping and validating. 
*/ +{ + mfn_t smfn, smfn2, map_mfn; + shadow_l1e_t *sl1p; + u32 shadow_idx, guest_idx; + int result = 0; + + /* Align address and size to guest entry boundaries */ + size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1); + new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1)); + size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1); + ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE); + + /* Map the shadow page */ + smfn = get_shadow_status(v, gmfn, sh_type); + ASSERT(valid_mfn(smfn)); /* Otherwise we would not have been called */ + guest_idx = guest_index(new_gp); + map_mfn = smfn; + shadow_idx = shadow_index(&map_mfn, guest_idx); + sl1p = map_shadow_page(map_mfn); + + /* Validate one entry at a time */ + while ( size ) + { + smfn2 = smfn; + guest_idx = guest_index(new_gp); + shadow_idx = shadow_index(&smfn2, guest_idx); + if ( mfn_x(smfn2) != mfn_x(map_mfn) ) + { + /* We have moved to another page of the shadow */ + map_mfn = smfn2; + unmap_shadow_page(sl1p); + sl1p = map_shadow_page(map_mfn); + } + result |= validate_ge(v, + new_gp, + map_mfn, + &sl1p[shadow_idx]); + size -= sizeof(guest_l1e_t); + new_gp += sizeof(guest_l1e_t); + } + unmap_shadow_page(sl1p); + return result; +} + + +int +sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn, + void *new_gl4p, u32 size) +{ +#if GUEST_PAGING_LEVELS >= 4 + return sh_map_and_validate(v, gl4mfn, new_gl4p, size, + PGC_SH_l4_shadow, + shadow_l4_index, + validate_gl4e); +#else // ! GUEST_PAGING_LEVELS >= 4 + SHADOW_PRINTK("called in wrong paging mode!\n"); + BUG(); + return 0; +#endif +} + +int +sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn, + void *new_gl3p, u32 size) +{ +#if GUEST_PAGING_LEVELS >= 3 + return sh_map_and_validate(v, gl3mfn, new_gl3p, size, + PGC_SH_l3_shadow, + shadow_l3_index, + validate_gl3e); +#else // ! 
GUEST_PAGING_LEVELS >= 3 + SHADOW_PRINTK("called in wrong paging mode!\n"); + BUG(); + return 0; +#endif +} + +int +sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn, + void *new_gl2p, u32 size) +{ + return sh_map_and_validate(v, gl2mfn, new_gl2p, size, + PGC_SH_l2_shadow, + shadow_l2_index, + validate_gl2e); +} + +int +sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn, + void *new_gl2p, u32 size) +{ +#if GUEST_PAGING_LEVELS == 3 + return sh_map_and_validate(v, gl2mfn, new_gl2p, size, + PGC_SH_l2h_shadow, + shadow_l2_index, + validate_gl2e); +#else /* Non-PAE guests don't have different kinds of l2 table */ + SHADOW_PRINTK("called in wrong paging mode!\n"); + BUG(); + return 0; +#endif +} + +int +sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn, + void *new_gl1p, u32 size) +{ + return sh_map_and_validate(v, gl1mfn, new_gl1p, size, + PGC_SH_l1_shadow, + shadow_l1_index, + validate_gl1e); +} + + +/**************************************************************************/ +/* Optimization: If we see two emulated writes of zeros to the same + * page-table without another kind of page fault in between, we guess + * that this is a batch of changes (for process destruction) and + * unshadow the page so we don't take a pagefault on every entry. This + * should also make finding writeable mappings of pagetables much + * easier. 
*/ + +/* Look to see if this is the second emulated write in a row to this + * page, and unshadow/unhook if it is */ +static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn) +{ +#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW + if ( v->arch.shadow.last_emulated_mfn == mfn_x(gmfn) && + sh_mfn_is_a_page_table(gmfn) ) + { + u32 flags = mfn_to_page(gmfn)->shadow_flags; + mfn_t smfn; + if ( !(flags & (SHF_L2_32|SHF_L3_PAE|SHF_L4_64)) ) + { + perfc_incrc(shadow_early_unshadow); + sh_remove_shadows(v, gmfn, 0 /* Can fail to unshadow */ ); + return; + } + /* SHF_unhooked_mappings is set to make sure we only unhook + * once in a single batch of updates. It is reset when this + * top-level page is loaded into CR3 again */ + if ( !(flags & SHF_unhooked_mappings) ) + { + perfc_incrc(shadow_early_unshadow_top); + mfn_to_page(gmfn)->shadow_flags |= SHF_unhooked_mappings; + if ( flags & SHF_L2_32 ) + { + smfn = get_shadow_status(v, gmfn, PGC_SH_l2_32_shadow); + shadow_unhook_mappings(v, smfn); + } + if ( flags & SHF_L3_PAE ) + { + smfn = get_shadow_status(v, gmfn, PGC_SH_l3_pae_shadow); + shadow_unhook_mappings(v, smfn); + } + if ( flags & SHF_L4_64 ) + { + smfn = get_shadow_status(v, gmfn, PGC_SH_l4_64_shadow); + shadow_unhook_mappings(v, smfn); + } + } + } + v->arch.shadow.last_emulated_mfn = mfn_x(gmfn); +#endif +} + +/* Stop counting towards early unshadows, as we've seen a real page fault */ +static inline void reset_early_unshadow(struct vcpu *v) +{ +#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW + v->arch.shadow.last_emulated_mfn = INVALID_MFN; +#endif +} + + + +/**************************************************************************/ +/* Entry points into the shadow code */ + +/* Called from pagefault handler in Xen, and from the HVM trap handlers + * for pagefaults. Returns 1 if this fault was an artefact of the + * shadow code (and the guest should retry) or 0 if it is not (and the + * fault should be handled elsewhere or passed to the guest). 
*/ + +static int sh_page_fault(struct vcpu *v, + unsigned long va, + struct cpu_user_regs *regs) +{ + struct domain *d = v->domain; + walk_t gw; + u32 accumulated_gflags; + gfn_t gfn; + mfn_t gmfn, sl1mfn=_mfn(0); + shadow_l1e_t sl1e, *ptr_sl1e; + paddr_t gpa; + struct cpu_user_regs emul_regs; + struct x86_emulate_ctxt emul_ctxt; + int r, mmio; + fetch_type_t ft = 0; + + // + // XXX: Need to think about eventually mapping superpages directly in the + // shadow (when possible), as opposed to splintering them into a + // bunch of 4K maps. + // + + SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n", + v->domain->domain_id, v->vcpu_id, va, regs->error_code); + + shadow_lock(d); + + shadow_audit_tables(v); + + if ( guest_walk_tables(v, va, &gw, 1) != 0 ) + { + SHADOW_PRINTK("malformed guest pagetable!"); + print_gw(&gw); + } + + sh_audit_gw(v, &gw); + + // We do not look at the gw->l1e, as that will not exist for superpages. + // Instead, we use the gw->eff_l1e... + // + // We need not check all the levels of the guest page table entries for + // present vs not-present, as the eff_l1e will always be not present if + // one of the higher level entries is not present. + // + if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) ) + { + if ( hvm_guest(v) && !shadow_vcpu_mode_translate(v) ) + { + /* Not present in p2m map, means this is mmio */ + gpa = va; + goto mmio; + } + + perfc_incrc(shadow_fault_bail_not_present); + goto not_a_shadow_fault; + } + + // All levels of the guest page table are now known to be present. + accumulated_gflags = accumulate_guest_flags(&gw); + + // Check for attempts to access supervisor-only pages from user mode, + // i.e. ring 3. Such errors are not caused or dealt with by the shadow + // code. 
+ // + if ( (regs->error_code & PFEC_user_mode) && + !(accumulated_gflags & _PAGE_USER) ) + { + /* illegal user-mode access to supervisor-only page */ + perfc_incrc(shadow_fault_bail_user_supervisor); + goto not_a_shadow_fault; + } + + // Was it a write fault? + // + if ( regs->error_code & PFEC_write_access ) + { + if ( unlikely(!(accumulated_gflags & _PAGE_RW)) ) + { + perfc_incrc(shadow_fault_bail_ro_mapping); + goto not_a_shadow_fault; + } + } + else // must have been either an insn fetch or read fault + { + // Check for NX bit violations: attempts to execute code that is + // marked "do not execute". Such errors are not caused or dealt with + // by the shadow code. + // + if ( regs->error_code & PFEC_insn_fetch ) + { + if ( accumulated_gflags & _PAGE_NX_BIT ) + { + /* NX prevented this code fetch */ + perfc_incrc(shadow_fault_bail_nx); + goto not_a_shadow_fault; + } + } + } + + /* Is this an MMIO access? */ + gfn = guest_l1e_get_gfn(gw.eff_l1e); + mmio = ( hvm_guest(v) + && shadow_vcpu_mode_translate(v) + && mmio_space(gfn_to_paddr(gfn)) ); + + /* For MMIO, the shadow holds the *gfn*; for normal accesses, if holds + * the equivalent mfn. */ + if ( mmio ) + gmfn = _mfn(gfn_x(gfn)); + else + { + gmfn = vcpu_gfn_to_mfn(v, gfn); + if ( !valid_mfn(gmfn) ) + { + perfc_incrc(shadow_fault_bail_bad_gfn); + SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"SH_PRI_mfn"\n", + gfn_x(gfn), mfn_x(gmfn)); + goto not_a_shadow_fault; + } + } + + /* Make sure there is enough free shadow memory to build a chain of + * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough + * to allocate all we need. (We never allocate a top-level shadow + * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */ + shadow_prealloc(d, SHADOW_MAX_ORDER); + + /* Acquire the shadow. This must happen before we figure out the rights + * for the shadow entry, since we might promote a page here. 
*/ + // XXX -- this code will need to change somewhat if/when the shadow code + // can directly map superpages... + ft = ((regs->error_code & PFEC_write_access) ? + ft_demand_write : ft_demand_read); + ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft); + ASSERT(ptr_sl1e); + + /* Calculate the shadow entry */ + if ( ft == ft_demand_write ) + { + if ( l1e_write_fault(v, &gw, gmfn, &sl1e, mmio) ) + { + perfc_incrc(shadow_fault_emulate_write); + goto emulate; + } + } + else if ( l1e_read_fault(v, &gw, gmfn, &sl1e, mmio) ) + { + perfc_incrc(shadow_fault_emulate_read); + goto emulate; + } + + /* Quick sanity check: we never make an MMIO entry that's got the + * _PAGE_PRESENT flag set in it. */ + ASSERT(!mmio || !(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT)); + + r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn); + + if ( mmio ) + { + gpa = guest_walk_to_gpa(&gw); + goto mmio; + } + +#if 0 + if ( !(r & SHADOW_SET_CHANGED) ) + debugtrace_printk("%s: shadow_set_l1e(va=%p, sl1e=%" SH_PRI_pte + ") did not change anything\n", + __func__, gw.va, l1e_get_intpte(sl1e)); +#endif + + perfc_incrc(shadow_fault_fixed); + d->arch.shadow.fault_count++; + reset_early_unshadow(v); + + done: + sh_audit_gw(v, &gw); + unmap_walk(v, &gw); + SHADOW_PRINTK("fixed\n"); + shadow_audit_tables(v); + shadow_unlock(d); + return EXCRET_fault_fixed; + + emulate: + + /* Take the register set we were called with */ + emul_regs = *regs; + if ( hvm_guest(v) ) + { + /* Add the guest's segment selectors, rip, rsp. rflags */ + hvm_store_cpu_guest_regs(v, &emul_regs, NULL); + } + emul_ctxt.regs = &emul_regs; + emul_ctxt.cr2 = va; + emul_ctxt.mode = hvm_guest(v) ? 
hvm_guest_x86_mode(v) : X86EMUL_MODE_HOST; + + SHADOW_PRINTK("emulate: eip=%#lx\n", emul_regs.eip); + + v->arch.shadow.propagate_fault = 0; + if ( x86_emulate_memop(&emul_ctxt, &shadow_emulator_ops) ) + { + SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n", + mfn_x(gmfn)); + perfc_incrc(shadow_fault_emulate_failed); + /* If this is actually a page table, then we have a bug, and need + * to support more operations in the emulator. More likely, + * though, this is a hint that this page should not be shadowed. */ + shadow_remove_all_shadows(v, gmfn); + /* This means that actual missing operations will cause the + * guest to loop on the same page fault. */ + goto done; + } + if ( v->arch.shadow.propagate_fault ) + { + /* Emulation triggered another page fault */ + goto not_a_shadow_fault; + } + + /* Emulator has changed the user registers: write back */ + if ( hvm_guest(v) ) + { + /* Write back the guest's segment selectors, rip, rsp. rflags */ + hvm_load_cpu_guest_regs(v, &emul_regs); + /* And don't overwrite those in the caller's regs. */ + emul_regs.eip = regs->eip; + emul_regs.cs = regs->cs; + emul_regs.eflags = regs->eflags; + emul_regs.esp = regs->esp; + emul_regs.ss = regs->ss; + emul_regs.es = regs->es; + emul_regs.ds = regs->ds; + emul_regs.fs = regs->fs; + emul_regs.gs = regs->gs; + } + *regs = emul_regs; + + goto done; + + mmio: + perfc_incrc(shadow_fault_mmio); + if ( !hvm_apic_support(d) && (gpa >= 0xFEC00000) ) + { + /* Need to deal with these disabled-APIC accesses, as + * handle_mmio() apparently does not currently do that. */ + /* TJD: What about it, then? For now, I'm turning this BUG() + * into a domain_crash() since we don't want to kill Xen. 
*/ + SHADOW_ERROR("disabled-APIC access: not supported\n."); + domain_crash(d); + } + sh_audit_gw(v, &gw); + unmap_walk(v, &gw); + SHADOW_PRINTK("mmio\n"); + shadow_audit_tables(v); + reset_early_unshadow(v); + shadow_unlock(d); + sh_log_mmio(v, gpa); + handle_mmio(va, gpa); + return EXCRET_fault_fixed; + + not_a_shadow_fault: + sh_audit_gw(v, &gw); + unmap_walk(v, &gw); + SHADOW_PRINTK("not a shadow fault\n"); + shadow_audit_tables(v); + reset_early_unshadow(v); + shadow_unlock(d); + return 0; +} + + +static int +sh_invlpg(struct vcpu *v, unsigned long va) +/* Called when the guest requests an invlpg. Returns 1 if the invlpg + * instruction should be issued on the hardware, or 0 if it's safe not + * to do so. */ +{ + shadow_l2e_t *ptr_sl2e = shadow_get_l2e(v, va); + + // XXX -- might be a good thing to prefetch the va into the shadow + + // no need to flush anything if there's no SL2... + // + if ( !ptr_sl2e ) + return 0; + + // If there's nothing shadowed for this particular sl2e, then + // there is no need to do an invlpg, either... + // + if ( !(shadow_l2e_get_flags(*ptr_sl2e) & _PAGE_PRESENT) ) + return 0; + + // Check to see if the SL2 is a splintered superpage... + // If so, then we'll need to flush the entire TLB (because that's + // easier than invalidating all of the individual 4K pages). + // + if ( (mfn_to_page(shadow_l2e_get_mfn(*ptr_sl2e))->count_info & + PGC_SH_type_mask) == PGC_SH_fl1_shadow ) + { + local_flush_tlb(); + return 0; + } + + return 1; +} + +static unsigned long +sh_gva_to_gfn(struct vcpu *v, unsigned long va) +/* Called to translate a guest virtual address to what the *guest* + * pagetables would map it to. */ +{ + walk_t gw; + gfn_t gfn; + + guest_walk_tables(v, va, &gw, 0); + gfn = guest_walk_to_gfn(&gw); + unmap_walk(v, &gw); + + return gfn_x(gfn); +} + + +static unsigned long +sh_gva_to_gpa(struct vcpu *v, unsigned long va) +/* Called to translate a guest virtual address to what the *guest* + * pagetables would map it to. 
*/ +{ + unsigned long gfn = sh_gva_to_gfn(v, va); + if ( gfn == INVALID_GFN ) + return 0; + else + return (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK); +} + + +// XXX -- should this be in this file? +// Or should it be moved to shadow-common.c? +// +/* returns a lowmem machine address of the copied HVM L3 root table + * If clear_res != 0, then clear the PAE-l3 reserved bits in the copy, + * otherwise blank out any entries with reserved bits in them. */ +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) +static unsigned long +hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res) +{ + int i, f; + int res = (_PAGE_RW|_PAGE_NX_BIT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY); + l3_pgentry_t new_l3e, *copy = v->arch.hvm_vcpu.hvm_lowmem_l3tab; + memcpy(copy, l3tab, 4 * sizeof(l3_pgentry_t)); + for ( i = 0; i < 4; i++ ) + { + f = l3e_get_flags(l3tab[i]); + if ( (f & _PAGE_PRESENT) && (!(f & res) || clear_res) ) + new_l3e = l3e_from_pfn(l3e_get_pfn(l3tab[i]), f & ~res); + else + new_l3e = l3e_empty(); + safe_write_entry(&copy[i], &new_l3e); + } + return __pa(copy); +} +#endif + + +static inline void +sh_update_linear_entries(struct vcpu *v) +/* Sync up all the linear mappings for this vcpu's pagetables */ +{ + struct domain *d = v->domain; + + /* Linear pagetables in PV guests + * ------------------------------ + * + * Guest linear pagetables, which map the guest pages, are at + * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the + * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these + * are set up at shadow creation time, but (of course!) the PAE case + * is subtler. Normal linear mappings are made by having an entry + * in the top-level table that points to itself (shadow linear) or + * to the guest top-level table (guest linear). For PAE, to set up + * a linear map requires us to copy the four top-level entries into + * level-2 entries. That means that every time we change a PAE l3e, + * we need to reflect the change into the copy. 
+ * + * Linear pagetables in HVM guests + * ------------------------------- + * + * For HVM guests, the linear pagetables are installed in the monitor + * tables (since we can't put them in the shadow). Shadow linear + * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START, + * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for + * a linear pagetable of the monitor tables themselves. We have + * the same issue of having to re-copy PAE l3 entries whevever we use + * PAE shadows. + * + * Because HVM guests run on the same monitor tables regardless of the + * shadow tables in use, the linear mapping of the shadow tables has to + * be updated every time v->arch.shadow_table changes. + */ + + /* Don't try to update the monitor table if it doesn't exist */ + if ( shadow_mode_external(d) + && pagetable_get_pfn(v->arch.monitor_table) == 0 ) + return; + +#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4) + + /* For PV, one l4e points at the guest l4, one points at the shadow + * l4. No maintenance required. + * For HVM, just need to update the l4e that points to the shadow l4. */ + + if ( shadow_mode_external(d) ) + { + /* Use the linear map if we can; otherwise make a new mapping */ + if ( v == current ) + { + __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] = + l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), + __PAGE_HYPERVISOR); + } + else + { + l4_pgentry_t *ml4e; + ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); + ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] = + l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), + __PAGE_HYPERVISOR); + sh_unmap_domain_page(ml4e); + } + } + +#elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3) + + /* This case only exists in HVM. To give ourselves a linear map of the + * shadows, we need to extend a PAE shadow to 4 levels. We do this by + * having a monitor l3 in slot 0 of the monitor l4 table, and + * copying the PAE l3 entries into it. 
Then, by having the monitor l4e + * for shadow pagetables also point to the monitor l4, we can use it + * to access the shadows. */ + + if ( shadow_mode_external(d) ) + { + /* Install copies of the shadow l3es into the monitor l3 table. + * The monitor l3 table is hooked into slot 0 of the monitor + * l4 table, so we use l3 linear indices 0 to 3 */ + shadow_l3e_t *sl3e; + l3_pgentry_t *ml3e; + mfn_t l3mfn; + int i; + + /* Use linear mappings if we can; otherwise make new mappings */ + if ( v == current ) + { + ml3e = __linear_l3_table; + l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0])); +#if GUEST_PAGING_LEVELS == 2 + /* Shadow l3 tables are made up by update_cr3 */ + sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab; +#else + sl3e = v->arch.shadow_vtable; +#endif + } + else + { + l4_pgentry_t *ml4e; + ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); + ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT); + l3mfn = _mfn(l4e_get_pfn(ml4e[0])); + ml3e = sh_map_domain_page(l3mfn); + sh_unmap_domain_page(ml4e); +#if GUEST_PAGING_LEVELS == 2 + /* Shadow l3 tables are made up by update_cr3 */ + sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab; +#else + sl3e = sh_map_domain_page(pagetable_get_mfn(v->arch.shadow_table)); +#endif + } + + for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) + { + ml3e[i] = + (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT) + ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])), + __PAGE_HYPERVISOR) + : l3e_empty(); + } + + if ( v != current ) + { + sh_unmap_domain_page(ml3e); +#if GUEST_PAGING_LEVELS != 2 + sh_unmap_domain_page(sl3e); +#endif + } + } + +#elif CONFIG_PAGING_LEVELS == 3 + + /* PV: need to copy the guest's l3 entries into the guest-linear-map l2 + * entries in the shadow, and the shadow's l3 entries into the + * shadow-linear-map l2 entries in the shadow. This is safe to do + * because Xen does not let guests share high-slot l2 tables between l3s, + * so we know we're not treading on anyone's toes. 
+ * + * HVM: need to copy the shadow's l3 entries into the + * shadow-linear-map l2 entries in the monitor table. This is safe + * because we have one monitor table for each vcpu. The monitor's + * own l3es don't need to be copied because they never change. + * XXX That might change if we start stuffing things into the rest + * of the monitor's virtual address space. + */ + { + l2_pgentry_t *l2e, new_l2e; + shadow_l3e_t *guest_l3e = NULL, *shadow_l3e; + int i; + +#if GUEST_PAGING_LEVELS == 2 + /* Shadow l3 tables were built by update_cr3 */ + if ( shadow_mode_external(d) ) + shadow_l3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab; + else + BUG(); /* PV 2-on-3 is not supported yet */ + +#else /* GUEST_PAGING_LEVELS == 3 */ + + /* Use local vcpu's mappings if we can; otherwise make new mappings */ + if ( v == current ) + { + shadow_l3e = v->arch.shadow_vtable; + if ( !shadow_mode_external(d) ) + guest_l3e = v->arch.guest_vtable; + } + else + { + mfn_t smfn; + int idx; + + /* Map the shadow l3 */ + smfn = pagetable_get_mfn(v->arch.shadow_table); + idx = shadow_l3_index(&smfn, guest_index(v->arch.shadow_vtable)); + shadow_l3e = sh_map_domain_page(smfn); + shadow_l3e += idx; + if ( !shadow_mode_external(d) ) + { + /* Also the guest l3 */ + mfn_t gmfn = pagetable_get_mfn(v->arch.guest_table); + guest_l3e = sh_map_domain_page(gmfn); + guest_l3e += guest_index(v->arch.guest_vtable); + } + } +#endif /* GUEST_PAGING_LEVELS */ + + /* Choose where to write the entries, using linear maps if possible */ + if ( v == current && shadow_mode_external(d) ) + { + /* From the monitor tables, it's safe to use linear maps to update + * monitor l2s */ + l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES); + } + else if ( shadow_mode_external(d) ) + { + /* Map the monitor table's high l2 */ + l3_pgentry_t *l3e; + l3e = sh_map_domain_page( + pagetable_get_mfn(v->arch.monitor_table)); + ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT); + l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3]))); + 
sh_unmap_domain_page(l3e); + } + else + { + /* Map the shadow table's high l2 */ + ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT); + l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3])); + } + + + if ( !shadow_mode_external(d) ) + { + /* Write linear mapping of guest. */ + for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) + { + new_l2e = (shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT) + ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])), + __PAGE_HYPERVISOR) + : l2e_empty(); + safe_write_entry( + &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], + &new_l2e); + } + } + + /* Write linear mapping of shadow. */ + for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) + { + new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT) + ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])), + __PAGE_HYPERVISOR) + : l2e_empty(); + safe_write_entry( + &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i], + &new_l2e); + } + + if ( v != current || !shadow_mode_external(d) ) + sh_unmap_domain_page(l2e); + +#if GUEST_PAGING_LEVELS == 3 + if ( v != current) + { + sh_unmap_domain_page(shadow_l3e); + if ( !shadow_mode_external(d) ) + sh_unmap_domain_page(guest_l3e); + } +#endif + } + +#elif CONFIG_PAGING_LEVELS == 2 + + /* For PV, one l2e points at the guest l2, one points at the shadow + * l2. No maintenance required. + * For HVM, just need to update the l2e that points to the shadow l2. 
*/ + + if ( shadow_mode_external(d) ) + { + /* Use the linear map if we can; otherwise make a new mapping */ + if ( v == current ) + { + __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] = + l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), + __PAGE_HYPERVISOR); + } + else + { + l2_pgentry_t *ml2e; + ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); + ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = + l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), + __PAGE_HYPERVISOR); + sh_unmap_domain_page(ml2e); + } + } + +#else +#error this should not happen +#endif +} + + +// XXX -- should this be in this file? +// Or should it be moved to shadow-common.c? +// +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) +void sh_pae_recopy(struct domain *d) +/* Called whenever we write to the l3 entries of a PAE pagetable which + * is currently in use. Each vcpu that is using the table needs to + * resync its copies of the l3s in linear maps and any low-memory + * copies it might have made for fitting into 32bit CR3. + * Since linear maps are also resynced when we change CR3, we don't + * need to worry about changes to PAE l3es that are not currently in use.*/ +{ + struct vcpu *v; + cpumask_t flush_mask = CPU_MASK_NONE; + ASSERT(shadow_lock_is_acquired(d)); + + for_each_vcpu(d, v) + { + if ( !v->arch.shadow.pae_flip_pending ) + continue; + + cpu_set(v->processor, flush_mask); + + SHADOW_PRINTK("d=%u v=%u\n", v->domain->domain_id, v->vcpu_id); + + /* This vcpu has a copy in its linear maps */ + sh_update_linear_entries(v); + if ( hvm_guest(v) ) + { + /* This vcpu has a copy in its HVM PAE l3 */ + v->arch.hvm_vcpu.hw_cr3 = + hvm_pae_copy_root(v, v->arch.shadow_vtable, + !shadow_vcpu_mode_translate(v)); + } +#if CONFIG_PAGING_LEVELS == 3 + else + { + /* This vcpu might have copied the l3 to below 4GB */ + if ( v->arch.cr3 >> PAGE_SHIFT + != pagetable_get_pfn(v->arch.shadow_table) ) + { + /* Recopy to where that copy is. 
*/ + int i; + l3_pgentry_t *dst, *src; + dst = __va(v->arch.cr3 & ~0x1f); /* Mask cache control bits */ + src = v->arch.shadow_vtable; + for ( i = 0 ; i < 4 ; i++ ) + safe_write_entry(dst + i, src + i); + } + } +#endif + v->arch.shadow.pae_flip_pending = 0; + } + + flush_tlb_mask(flush_mask); +} +#endif /* (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) */ + + +/* removes: + * vcpu->arch.guest_vtable + * vcpu->arch.shadow_table + * vcpu->arch.shadow_vtable + * Does all appropriate management/bookkeeping/refcounting/etc... + */ +static void +sh_detach_old_tables(struct vcpu *v) +{ + mfn_t smfn; + + //// + //// vcpu->arch.guest_vtable + //// + if ( (shadow_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) && + v->arch.guest_vtable ) + { + // Q: why does this need to use (un)map_domain_page_*global* ? + sh_unmap_domain_page_global(v->arch.guest_vtable); + v->arch.guest_vtable = NULL; + } + + //// + //// vcpu->arch.shadow_table + //// + smfn = pagetable_get_mfn(v->arch.shadow_table); + if ( mfn_x(smfn) ) + { + ASSERT(v->arch.shadow_vtable); + +#if GUEST_PAGING_LEVELS == 3 + // PAE guests do not (necessarily) use an entire page for their + // 4-entry L3s, so we have to deal with them specially. + // + sh_put_ref_l3_subshadow(v, v->arch.shadow_vtable, smfn); +#else + sh_put_ref(v, smfn, 0); +#endif + +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3) + { + struct pae_l3_bookkeeping *info = + sl3p_to_info(v->arch.shadow_vtable); + ASSERT(test_bit(v->vcpu_id, &info->vcpus)); + clear_bit(v->vcpu_id, &info->vcpus); + } +#endif + v->arch.shadow_table = pagetable_null(); + } + + //// + //// vcpu->arch.shadow_vtable + //// + if ( (shadow_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) && + v->arch.shadow_vtable ) + { + // Q: why does this need to use (un)map_domain_page_*global* ? 
+ // + sh_unmap_domain_page_global(v->arch.shadow_vtable); + v->arch.shadow_vtable = NULL; + } +} + +static void +sh_update_cr3(struct vcpu *v) +/* Updates vcpu->arch.shadow_table after the guest has changed CR3. + * Paravirtual guests should set v->arch.guest_table (and guest_table_user, + * if appropriate). + * HVM guests should also set hvm_get_guest_cntl_reg(v, 3)... + */ +{ + struct domain *d = v->domain; + mfn_t gmfn, smfn; +#if GUEST_PAGING_LEVELS == 3 + u32 guest_idx=0; +#endif + + ASSERT(shadow_lock_is_acquired(v->domain)); + ASSERT(v->arch.shadow.mode); + + //// + //// vcpu->arch.guest_table is already set + //// + +#ifndef NDEBUG + /* Double-check that the HVM code has sent us a sane guest_table */ + if ( hvm_guest(v) ) + { + gfn_t gfn; + + ASSERT(shadow_mode_external(d)); + + // Is paging enabled on this vcpu? + if ( shadow_vcpu_mode_translate(v) ) + { + gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3))); + gmfn = vcpu_gfn_to_mfn(v, gfn); + ASSERT(valid_mfn(gmfn)); + ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn)); + } + else + { + /* Paging disabled: guest_table points at (part of) p2m */ +#if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */ + /* For everything else, they sould be the same */ + ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn); +#endif + } + } +#endif + + SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n", + d->domain_id, v->vcpu_id, + (unsigned long)pagetable_get_pfn(v->arch.guest_table)); + +#if GUEST_PAGING_LEVELS == 4 + if ( !(v->arch.flags & TF_kernel_mode) ) + gmfn = pagetable_get_mfn(v->arch.guest_table_user); + else +#endif + gmfn = pagetable_get_mfn(v->arch.guest_table); + + sh_detach_old_tables(v); + + if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) ) + { + ASSERT(v->arch.cr3 == 0); + return; + } + + //// + //// vcpu->arch.guest_vtable + //// + if ( shadow_mode_external(d) ) + { +#if GUEST_PAGING_LEVELS == 3 + if ( shadow_vcpu_mode_translate(v) ) + /* Paging enabled: find 
where in the page the l3 table is */ + guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3)); + else + /* Paging disabled: l3 is at the start of a page (in the p2m) */ + guest_idx = 0; + + // Ignore the low 2 bits of guest_idx -- they are really just + // cache control. + guest_idx &= ~3; + // XXX - why does this need a global map? + v->arch.guest_vtable = + (guest_l3e_t *)sh_map_domain_page_global(gmfn) + guest_idx; +#else + // XXX - why does this need a global map? + v->arch.guest_vtable = sh_map_domain_page_global(gmfn); +#endif + } + else + { +#ifdef __x86_64__ + v->arch.guest_vtable = __linear_l4_table; +#elif GUEST_PAGING_LEVELS == 3 + // XXX - why does this need a global map? + v->arch.guest_vtable = sh_map_domain_page_global(gmfn); +#else + v->arch.guest_vtable = __linear_l2_table; +#endif + } + +#if 0 + printk("%s %s %d gmfn=%05lx guest_vtable=%p\n", + __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable); +#endif + + //// + //// vcpu->arch.shadow_table + //// + smfn = get_shadow_status(v, gmfn, PGC_SH_guest_root_type); + if ( valid_mfn(smfn) ) + { + /* Pull this root shadow to the front of the list of roots. */ + list_del(&mfn_to_page(smfn)->list); + list_add(&mfn_to_page(smfn)->list, &d->arch.shadow.toplevel_shadows); + } + else + { + /* This guest MFN is a pagetable. Must revoke write access. */ + if ( shadow_remove_write_access(v, gmfn, GUEST_PAGING_LEVELS, 0) + != 0 ) + flush_tlb_mask(d->domain_dirty_cpumask); + /* Make sure there's enough free shadow memory. */ + shadow_prealloc(d, SHADOW_MAX_ORDER); + /* Shadow the page. 
*/ + smfn = sh_make_shadow(v, gmfn, PGC_SH_guest_root_type); + list_add(&mfn_to_page(smfn)->list, &d->arch.shadow.toplevel_shadows); + } + ASSERT(valid_mfn(smfn)); + v->arch.shadow_table = pagetable_from_mfn(smfn); + +#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW + /* Once again OK to unhook entries from this table if we see fork/exit */ + ASSERT(sh_mfn_is_a_page_table(gmfn)); + mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings; +#endif + + + //// + //// vcpu->arch.shadow_vtable + //// + if ( shadow_mode_external(d) ) + { +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3) + mfn_t adjusted_smfn = smfn; + u32 shadow_idx = shadow_l3_index(&adjusted_smfn, guest_idx); + // Q: why does this need to use (un)map_domain_page_*global* ? + v->arch.shadow_vtable = + (shadow_l3e_t *)sh_map_domain_page_global(adjusted_smfn) + + shadow_idx; +#else + // Q: why does this need to use (un)map_domain_page_*global* ? + v->arch.shadow_vtable = sh_map_domain_page_global(smfn); +#endif + } + else + { +#if SHADOW_PAGING_LEVELS == 4 + v->arch.shadow_vtable = __sh_linear_l4_table; +#elif GUEST_PAGING_LEVELS == 3 + // XXX - why does this need a global map? + v->arch.shadow_vtable = sh_map_domain_page_global(smfn); +#else + v->arch.shadow_vtable = __sh_linear_l2_table; +#endif + } + + //// + //// Take a ref to the new shadow table, and pin it. + //// + // + // This ref is logically "held" by v->arch.shadow_table entry itself. + // Release the old ref. + // +#if GUEST_PAGING_LEVELS == 3 + // PAE guests do not (necessarily) use an entire page for their + // 4-entry L3s, so we have to deal with them specially. + // + // XXX - might want to revisit this if/when we do multiple compilation for + // HVM-vs-PV guests, as PAE PV guests could get away without doing + // subshadows. 
+ // + sh_get_ref_l3_subshadow(v->arch.shadow_vtable, smfn); + sh_pin_l3_subshadow(v->arch.shadow_vtable, smfn); +#else + sh_get_ref(smfn, 0); + sh_pin(smfn); +#endif + +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3) + // PAE 3-on-3 shadows have to keep track of which vcpu's are using + // which l3 subshadow, in order handle the SHADOW_SET_L3PAE_RECOPY + // case from validate_gl3e(). Search for SHADOW_SET_L3PAE_RECOPY + // in the code for more info. + // + { + struct pae_l3_bookkeeping *info = + sl3p_to_info(v->arch.shadow_vtable); + ASSERT(!test_bit(v->vcpu_id, &info->vcpus)); + set_bit(v->vcpu_id, &info->vcpus); + } +#endif + + debugtrace_printk("%s cr3 gmfn=%05lx smfn=%05lx\n", + __func__, gmfn, smfn); + + /// + /// v->arch.cr3 and, if appropriate, v->arch.hvm_vcpu.hw_cr3 + /// + if ( shadow_mode_external(d) ) + { + ASSERT(hvm_guest(v)); + make_cr3(v, pagetable_get_pfn(v->arch.monitor_table)); + +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2) +#if SHADOW_PAGING_LEVELS != 3 +#error unexpected combination of GUEST and SHADOW paging levels +#endif + /* 2-on-3: make a PAE l3 table that points at the four-page l2 */ + { + mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table); + int i; + + ASSERT(v->arch.hvm_vcpu.hw_cr3 == + virt_to_maddr(v->arch.hvm_vcpu.hvm_lowmem_l3tab)); + for (i = 0; i < 4; i++) + { + v->arch.hvm_vcpu.hvm_lowmem_l3tab[i] = + shadow_l3e_from_mfn(_mfn(mfn_x(smfn)+i), _PAGE_PRESENT); + } + } +#elif (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) + /* 3-on-3: copy the shadow l3 to slots that are below 4GB. + * If paging is disabled, clear l3e reserved bits; otherwise + * remove entries that have reserved bits set. 
*/ + v->arch.hvm_vcpu.hw_cr3 = + hvm_pae_copy_root(v, v->arch.shadow_vtable, + !shadow_vcpu_mode_translate(v)); +#else + /* 2-on-2 or 4-on-4: just put the shadow top-level into cr3 */ + v->arch.hvm_vcpu.hw_cr3 = + pagetable_get_paddr(v->arch.shadow_table); +#endif + } + else // not shadow_mode_external... + { + /* We don't support PV except guest == shadow == config levels */ + BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS); + make_cr3(v, pagetable_get_pfn(v->arch.shadow_table)); + } + + /* Fix up the linear pagetable mappings */ + sh_update_linear_entries(v); +} + + +/**************************************************************************/ +/* Functions to revoke guest rights */ + +#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC +static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn) +/* Look up this vaddr in the current shadow and see if it's a writeable + * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */ +{ + shadow_l1e_t sl1e, *sl1p; + shadow_l2e_t *sl2p; +#if GUEST_PAGING_LEVELS >= 3 + shadow_l3e_t *sl3p; +#if GUEST_PAGING_LEVELS >= 4 + shadow_l4e_t *sl4p; +#endif +#endif + mfn_t sl1mfn; + + + /* Carefully look in the shadow linear map for the l1e we expect */ + if ( v->arch.shadow_vtable == NULL ) return 0; +#if GUEST_PAGING_LEVELS >= 4 + sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr); + if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) ) + return 0; + sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr); + if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) ) + return 0; +#elif GUEST_PAGING_LEVELS == 3 + sl3p = ((shadow_l3e_t *) v->arch.shadow_vtable) + + shadow_l3_linear_offset(vaddr); + if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) ) + return 0; +#endif + sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr); + if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) ) + return 0; + sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr); + sl1e = *sl1p; + if ( 
((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW)) + != (_PAGE_PRESENT|_PAGE_RW)) + || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) ) + return 0; + + /* Found it! Need to remove its write permissions. */ + sl1mfn = shadow_l2e_get_mfn(*sl2p); + sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW); + shadow_set_l1e(v, sl1p, sl1e, sl1mfn); + return 1; +} +#endif + +int sh_remove_write_access(struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn) +/* Excises all writeable mappings to readonly_mfn from this l1 shadow table */ +{ + shadow_l1e_t *sl1e; + int done = 0; + int flags; + + SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, + { + flags = shadow_l1e_get_flags(*sl1e); + if ( (flags & _PAGE_PRESENT) + && (flags & _PAGE_RW) + && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) ) + { + shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn); + if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info + & PGT_count_mask) == 0 ) + /* This breaks us cleanly out of the FOREACH macro */ + done = 1; + } + }); + return done; +} + + +int sh_remove_all_mappings(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn) +/* Excises all mappings to guest frame from this shadow l1 table */ +{ + shadow_l1e_t *sl1e; + int done = 0; + int flags; + + SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, + { + flags = shadow_l1e_get_flags(*sl1e); + if ( (flags & _PAGE_PRESENT) + && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) ) + { + shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn); + if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 ) + /* This breaks us cleanly out of the FOREACH macro */ + done = 1; + } + }); + return done; +} + +/**************************************************************************/ +/* Functions to excise all pointers to shadows from higher-level shadows. 
*/ + +void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn) +/* Blank out a single shadow entry */ +{ + switch (mfn_to_page(smfn)->count_info & PGC_SH_type_mask) + { + case PGC_SH_l1_shadow: + shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break; + case PGC_SH_l2_shadow: +#if GUEST_PAGING_LEVELS == 3 + case PGC_SH_l2h_shadow: +#endif + shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break; +#if GUEST_PAGING_LEVELS >= 3 + case PGC_SH_l3_shadow: + shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break; +#if GUEST_PAGING_LEVELS >= 4 + case PGC_SH_l4_shadow: + shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break; +#endif +#endif + default: BUG(); /* Called with the wrong kind of shadow. */ + } +} + +int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn) +/* Remove all mappings of this l1 shadow from this l2 shadow */ +{ + shadow_l2e_t *sl2e; + int done = 0; + int flags; +#if GUEST_PAGING_LEVELS != 4 + int xen_mappings = !shadow_mode_external(v->domain); +#endif + + SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, xen_mappings, + { + flags = shadow_l2e_get_flags(*sl2e); + if ( (flags & _PAGE_PRESENT) + && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) ) + { + shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn); + if ( (mfn_to_page(sl1mfn)->count_info & PGC_SH_type_mask) == 0 ) + /* This breaks us cleanly out of the FOREACH macro */ + done = 1; + } + }); + return done; +} + +#if GUEST_PAGING_LEVELS >= 3 +int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn) +/* Remove all mappings of this l2 shadow from this l3 shadow */ +{ + shadow_l3e_t *sl3e; + int done = 0; + int flags; + + SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done, + { + flags = shadow_l3e_get_flags(*sl3e); + if ( (flags & _PAGE_PRESENT) + && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) ) + { + shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn); + if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH_type_mask) == 0 ) + /* This breaks us cleanly out of the FOREACH macro */ + 
done = 1; + } + }); + return done; +} + +#if GUEST_PAGING_LEVELS >= 4 +int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn) +/* Remove all mappings of this l3 shadow from this l4 shadow */ +{ + shadow_l4e_t *sl4e; + int done = 0; + int flags, xen_mappings = !shadow_mode_external(v->domain); + + SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, xen_mappings, + { + flags = shadow_l4e_get_flags(*sl4e); + if ( (flags & _PAGE_PRESENT) + && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) ) + { + shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn); + if ( (mfn_to_page(sl3mfn)->count_info & PGC_SH_type_mask) == 0 ) + /* This breaks us cleanly out of the FOREACH macro */ + done = 1; + } + }); + return done; +} +#endif /* 64bit guest */ +#endif /* PAE guest */ + +/**************************************************************************/ +/* Handling HVM guest writes to pagetables */ + +/* Check that the user is allowed to perform this write. + * Returns a mapped pointer to write to, and the mfn it's on, + * or NULL for error. */ +static inline void * emulate_map_dest(struct vcpu *v, + unsigned long vaddr, + struct x86_emulate_ctxt *ctxt, + mfn_t *mfnp) +{ + walk_t gw; + u32 flags; + gfn_t gfn; + mfn_t mfn; + + guest_walk_tables(v, vaddr, &gw, 1); + flags = accumulate_guest_flags(&gw); + gfn = guest_l1e_get_gfn(gw.eff_l1e); + mfn = vcpu_gfn_to_mfn(v, gfn); + sh_audit_gw(v, &gw); + unmap_walk(v, &gw); + + if ( !(flags & _PAGE_PRESENT) + || !(flags & _PAGE_RW) + || (!(flags & _PAGE_USER) && ring_3(ctxt->regs)) ) + { + /* This write would have faulted even on bare metal */ + v->arch.shadow.propagate_fault = 1; + return NULL; + } + + if ( !valid_mfn(mfn) ) + { + /* Attempted a write to a bad gfn. This should never happen: + * after all, we're here because this write is to a page table. 
*/ + BUG(); + } + + ASSERT(sh_mfn_is_a_page_table(mfn)); + *mfnp = mfn; + return sh_map_domain_page(mfn) + (vaddr & ~PAGE_MASK); +} + +int +sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src, + u32 bytes, struct x86_emulate_ctxt *ctxt) +{ + ASSERT(shadow_lock_is_acquired(v->domain)); + while ( bytes > 0 ) + { + mfn_t mfn; + int bytes_on_page; + void *addr; + + bytes_on_page = PAGE_SIZE - (vaddr & ~PAGE_MASK); + if ( bytes_on_page > bytes ) + bytes_on_page = bytes; + + if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL ) + return X86EMUL_PROPAGATE_FAULT; + memcpy(addr, src, bytes_on_page); + shadow_validate_guest_pt_write(v, mfn, addr, bytes_on_page); + bytes -= bytes_on_page; + /* If we are writing zeros to this page, might want to unshadow */ + if ( *(u8 *)addr == 0 ) + check_for_early_unshadow(v, mfn); + sh_unmap_domain_page(addr); + } + shadow_audit_tables(v); + return X86EMUL_CONTINUE; +} + +int +sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr, + unsigned long old, unsigned long new, + unsigned int bytes, struct x86_emulate_ctxt *ctxt) +{ + mfn_t mfn; + void *addr; + unsigned long prev; + int rv = X86EMUL_CONTINUE; + + ASSERT(shadow_lock_is_acquired(v->domain)); + ASSERT(bytes <= sizeof (unsigned long)); + + if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL ) + return X86EMUL_PROPAGATE_FAULT; + + switch (bytes) + { + case 1: prev = cmpxchg(((u8 *)addr), old, new); break; + case 2: prev = cmpxchg(((u16 *)addr), old, new); break; + case 4: prev = cmpxchg(((u32 *)addr), old, new); break; + case 8: prev = cmpxchg(((u64 *)addr), old, new); break; + default: + SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes); + prev = ~old; + } + + if ( (prev == old) ) + shadow_validate_guest_pt_write(v, mfn, addr, bytes); + else + rv = X86EMUL_CMPXCHG_FAILED; + + SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx" + " wanted %#lx now %#lx bytes %u\n", + vaddr, prev, old, new, *(unsigned long *)addr, bytes); + + /* If 
we are writing zeros to this page, might want to unshadow */ + if ( *(u8 *)addr == 0 ) + check_for_early_unshadow(v, mfn); + + sh_unmap_domain_page(addr); + shadow_audit_tables(v); + check_for_early_unshadow(v, mfn); + return rv; +} + +int +sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr, + unsigned long old_lo, unsigned long old_hi, + unsigned long new_lo, unsigned long new_hi, + struct x86_emulate_ctxt *ctxt) +{ + mfn_t mfn; + void *addr; + u64 old, new, prev; + int rv = X86EMUL_CONTINUE; + + ASSERT(shadow_lock_is_acquired(v->domain)); + + if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL ) + return X86EMUL_PROPAGATE_FAULT; + + old = (((u64) old_hi) << 32) | (u64) old_lo; + new = (((u64) new_hi) << 32) | (u64) new_lo; + prev = cmpxchg(((u64 *)addr), old, new); + + if ( (prev == old) ) + shadow_validate_guest_pt_write(v, mfn, addr, 8); + else + rv = X86EMUL_CMPXCHG_FAILED; + + /* If we are writing zeros to this page, might want to unshadow */ + if ( *(u8 *)addr == 0 ) + check_for_early_unshadow(v, mfn); + + sh_unmap_domain_page(addr); + shadow_audit_tables(v); + check_for_early_unshadow(v, mfn); + return rv; +} + + +/**************************************************************************/ +/* Audit tools */ + +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES + +#define AUDIT_FAIL(_level, _fmt, _a...) 
do { \ + printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \ + "gl" #_level "mfn = %" SH_PRI_mfn \ + " sl" #_level "mfn = %" SH_PRI_mfn \ + " &gl" #_level "e = %p &sl" #_level "e = %p" \ + " gl" #_level "e = %" SH_PRI_gpte \ + " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \ + GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \ + _level, guest_index(gl ## _level ## e), \ + mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \ + gl ## _level ## e, sl ## _level ## e, \ + gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \ + ##_a); \ + BUG(); \ + done = 1; \ +} while (0) + + +static char * sh_audit_flags(struct vcpu *v, int level, + int gflags, int sflags) +/* Common code for auditing flag bits */ +{ + if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) ) + return "shadow is present but guest is not present"; + if ( (sflags & _PAGE_GLOBAL) && !hvm_guest(v) ) + return "global bit set in PV shadow"; + if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE))) + && ((sflags & _PAGE_DIRTY) && !(gflags & _PAGE_DIRTY)) ) + return "dirty bit not propagated"; + if ( level == 2 && (sflags & _PAGE_PSE) ) + return "PS bit set in shadow"; +#if SHADOW_PAGING_LEVELS == 3 + if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */ +#endif + if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) ) + return "user/supervisor bit does not match"; + if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) ) + return "NX bit does not match"; + if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) ) + return "shadow grants write access but guest does not"; + if ( (sflags & _PAGE_ACCESSED) && !(gflags & _PAGE_ACCESSED) ) + return "accessed bit not propagated"; + return NULL; +} + +static inline mfn_t +audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn) +/* Convert this gfn to an mfn in the manner appropriate for the + * guest pagetable it's used in (gmfn) */ +{ + if ( !shadow_mode_translate(v->domain) ) + return _mfn(gfn_x(gfn)); + + if ( 
(mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask) + != PGT_writable_page ) + return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */ + else + return sh_gfn_to_mfn(v->domain, gfn_x(gfn)); +} + + +int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x) +{ + guest_l1e_t *gl1e, *gp; + shadow_l1e_t *sl1e; + mfn_t mfn, gmfn, gl1mfn; + gfn_t gfn; + char *s; + int done = 0; + + /* Follow the backpointer */ + gl1mfn = _mfn(mfn_to_page(sl1mfn)->u.inuse.type_info); + gl1e = gp = sh_map_domain_page(gl1mfn); + SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, { + + s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e), + shadow_l1e_get_flags(*sl1e)); + if ( s ) AUDIT_FAIL(1, "%s", s); + + if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS ) + { + gfn = guest_l1e_get_gfn(*gl1e); + mfn = shadow_l1e_get_mfn(*sl1e); + gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn); + if ( mfn_x(gmfn) != mfn_x(mfn) ) + AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn + " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n", + gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn)); + } + }); + sh_unmap_domain_page(gp); + return done; +} + +int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x) +{ + guest_l1e_t *gl1e, e; + shadow_l1e_t *sl1e; + mfn_t gl1mfn = _mfn(INVALID_MFN); + int f; + int done = 0; + + /* fl1 has no useful backpointer: all we can check are flags */ + e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */ + SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, { + f = shadow_l1e_get_flags(*sl1e); + f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2); + if ( !(f == 0 + || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| + _PAGE_ACCESSED|_PAGE_DIRTY) + || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)) ) + AUDIT_FAIL(1, "fl1e has bad flags"); + }); + return 0; +} + +int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x) +{ + guest_l2e_t *gl2e, *gp; + shadow_l2e_t *sl2e; + mfn_t mfn, gmfn, gl2mfn; + gfn_t gfn; + char *s; + int done = 0; +#if GUEST_PAGING_LEVELS != 4 + int xen_mappings = 
!shadow_mode_external(v->domain); +#endif + + /* Follow the backpointer */ + gl2mfn = _mfn(mfn_to_page(sl2mfn)->u.inuse.type_info); + gl2e = gp = sh_map_domain_page(gl2mfn); + SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, xen_mappings, { + + s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e), + shadow_l2e_get_flags(*sl2e)); + if ( s ) AUDIT_FAIL(2, "%s", s); + + if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS ) + { + gfn = guest_l2e_get_gfn(*gl2e); + mfn = shadow_l2e_get_mfn(*sl2e); + gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) + ? get_fl1_shadow_status(v, gfn) + : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn), + PGC_SH_l1_shadow); + if ( mfn_x(gmfn) != mfn_x(mfn) ) + AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn + " (--> %" SH_PRI_mfn ")" + " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n", + gfn_x(gfn), + (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0 + : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)), + mfn_x(gmfn), mfn_x(mfn)); + } + }); + sh_unmap_domain_page(gp); + return 0; +} + +#if GUEST_PAGING_LEVELS >= 3 +int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x) +{ + guest_l3e_t *gl3e, *gp; + shadow_l3e_t *sl3e; + mfn_t mfn, gmfn, gl3mfn; + gfn_t gfn; + char *s; + int done = 0; + + /* Follow the backpointer */ + gl3mfn = _mfn(mfn_to_page(sl3mfn)->u.inuse.type_info); + gl3e = gp = sh_map_domain_page(gl3mfn); + SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, { + + s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e), + shadow_l3e_get_flags(*sl3e)); + if ( s ) AUDIT_FAIL(3, "%s", s); + + if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS ) + { + gfn = guest_l3e_get_gfn(*gl3e); + mfn = shadow_l3e_get_mfn(*sl3e); + gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn), + (GUEST_PAGING_LEVELS == 3 + && !shadow_mode_external(v->domain) + && (guest_index(gl3e) % 4) == 3) + ? 
PGC_SH_l2h_pae_shadow + : PGC_SH_l2_shadow); + if ( mfn_x(gmfn) != mfn_x(mfn) ) + AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn + " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n", + gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn)); + } + }); + sh_unmap_domain_page(gp); + return 0; +} +#endif /* GUEST_PAGING_LEVELS >= 3 */ + +#if GUEST_PAGING_LEVELS >= 4 +int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x) +{ + guest_l4e_t *gl4e, *gp; + shadow_l4e_t *sl4e; + mfn_t mfn, gmfn, gl4mfn; + gfn_t gfn; + char *s; + int done = 0; + int xen_mappings = !shadow_mode_external(v->domain); + + /* Follow the backpointer */ + gl4mfn = _mfn(mfn_to_page(sl4mfn)->u.inuse.type_info); + gl4e = gp = sh_map_domain_page(gl4mfn); + SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, xen_mappings, + { + s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e), + shadow_l4e_get_flags(*sl4e)); + if ( s ) AUDIT_FAIL(4, "%s", s); + + if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS ) + { + gfn = guest_l4e_get_gfn(*gl4e); + mfn = shadow_l4e_get_mfn(*sl4e); + gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn), + PGC_SH_l3_shadow); + if ( mfn_x(gmfn) != mfn_x(mfn) ) + AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn + " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n", + gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn)); + } + }); + sh_unmap_domain_page(gp); + return 0; +} +#endif /* GUEST_PAGING_LEVELS >= 4 */ + + +#undef AUDIT_FAIL + +#endif /* Audit code */ + +/**************************************************************************/ +/* Entry points into this mode of the shadow code. + * This will all be mangled by the preprocessor to uniquify everything. 
*/ +struct shadow_paging_mode sh_paging_mode = { + .page_fault = sh_page_fault, + .invlpg = sh_invlpg, + .gva_to_gpa = sh_gva_to_gpa, + .gva_to_gfn = sh_gva_to_gfn, + .update_cr3 = sh_update_cr3, + .map_and_validate_gl1e = sh_map_and_validate_gl1e, + .map_and_validate_gl2e = sh_map_and_validate_gl2e, + .map_and_validate_gl2he = sh_map_and_validate_gl2he, + .map_and_validate_gl3e = sh_map_and_validate_gl3e, + .map_and_validate_gl4e = sh_map_and_validate_gl4e, + .detach_old_tables = sh_detach_old_tables, + .x86_emulate_write = sh_x86_emulate_write, + .x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg, + .x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b, + .make_monitor_table = sh_make_monitor_table, + .destroy_monitor_table = sh_destroy_monitor_table, +#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC + .guess_wrmap = sh_guess_wrmap, +#endif + .guest_levels = GUEST_PAGING_LEVELS, + .shadow_levels = SHADOW_PAGING_LEVELS, +}; + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/arch/x86/mm/shadow/multi.h b/xen/arch/x86/mm/shadow/multi.h new file mode 100644 index 0000000000..26a4675a71 --- /dev/null +++ b/xen/arch/x86/mm/shadow/multi.h @@ -0,0 +1,116 @@ +/****************************************************************************** + * arch/x86/mm/shadow/multi.h + * + * Shadow declarations which will be multiply compiled. + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +extern int +SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t gl1mfn, void *new_gl1p, u32 size); +extern int +SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size); +extern int +SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size); +extern int +SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t gl3mfn, void *new_gl3p, u32 size); +extern int +SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t gl4mfn, void *new_gl4p, u32 size); + +extern void +SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t smfn); +extern void +SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t smfn); +extern void +SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t smfn); +extern void +SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t smfn); + +extern void +SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows, 3, 3) + (struct vcpu *v, mfn_t smfn); + +extern void +SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl2mfn); +extern void +SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, SHADOW_LEVELS, 
GUEST_LEVELS) + (struct vcpu *v, mfn_t sl3mfn); +extern void +SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl4mfn); + +extern int +SHADOW_INTERNAL_NAME(sh_remove_write_access, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn); +extern int +SHADOW_INTERNAL_NAME(sh_remove_all_mappings, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn); + +extern void +SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, void *ep, mfn_t smfn); + +extern int +SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn); +extern int +SHADOW_INTERNAL_NAME(sh_remove_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn); +extern int +SHADOW_INTERNAL_NAME(sh_remove_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn); + +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES +int +SHADOW_INTERNAL_NAME(sh_audit_l1_table, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl1mfn, mfn_t x); +int +SHADOW_INTERNAL_NAME(sh_audit_fl1_table, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl1mfn, mfn_t x); +int +SHADOW_INTERNAL_NAME(sh_audit_l2_table, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl2mfn, mfn_t x); +int +SHADOW_INTERNAL_NAME(sh_audit_l3_table, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl3mfn, mfn_t x); +int +SHADOW_INTERNAL_NAME(sh_audit_l4_table, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl4mfn, mfn_t x); +#endif + +#if SHADOW_LEVELS == GUEST_LEVELS +extern mfn_t +SHADOW_INTERNAL_NAME(sh_make_monitor_table, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v); +extern void +SHADOW_INTERNAL_NAME(sh_destroy_monitor_table, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t mmfn); +#endif + +extern struct shadow_paging_mode +SHADOW_INTERNAL_NAME(sh_paging_mode, SHADOW_LEVELS, 
GUEST_LEVELS); diff --git a/xen/arch/x86/mm/shadow/page-guest32.h b/xen/arch/x86/mm/shadow/page-guest32.h new file mode 100644 index 0000000000..e93206169a --- /dev/null +++ b/xen/arch/x86/mm/shadow/page-guest32.h @@ -0,0 +1,105 @@ + +#ifndef __X86_PAGE_GUEST_H__ +#define __X86_PAGE_GUEST_H__ + +#ifndef __ASSEMBLY__ +# include +#endif + +#define PAGETABLE_ORDER_32 10 +#define L1_PAGETABLE_ENTRIES_32 (1<> L1_PAGETABLE_SHIFT_32) & (L1_PAGETABLE_ENTRIES_32 - 1)) +#define l2_table_offset_32(a) \ + (((a) >> L2_PAGETABLE_SHIFT_32) & (L2_PAGETABLE_ENTRIES_32 - 1)) + +#define linear_l1_table_32 \ + ((l1_pgentry_32_t *)(LINEAR_PT_VIRT_START)) + +#define linear_pg_table_32 linear_l1_table_32 + +#endif /* __X86_PAGE_GUEST_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h new file mode 100644 index 0000000000..f470a874ba --- /dev/null +++ b/xen/arch/x86/mm/shadow/private.h @@ -0,0 +1,593 @@ +/****************************************************************************** + * arch/x86/mm/shadow/private.h + * + * Shadow code that is private, and does not need to be multiply compiled. + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _XEN_SHADOW_PRIVATE_H +#define _XEN_SHADOW_PRIVATE_H + +// In order to override the definition of mfn_to_page, we make sure page.h has +// been included... +#include +#include +#include +#include + + +/****************************************************************************** + * Definitions for the use of the "available" bits in the shadow PTEs. + * + * Review of the low 12 bits of a shadow page table entry: + * + * in a guest: in a shadow: + * Bit 11: _PAGE_AVAIL2, aka _PAGE_GNTTAB + * Bit 10: _PAGE_AVAIL1 _PAGE_SHADOW_RW ("SW" below) + * Bit 9: _PAGE_AVAIL0 _PAGE_SHADOW_PRESENT ("SP" below) + * Bit 8: _PAGE_GLOBAL _PAGE_SHADOW_MMIO ("MMIO" below), + * aka _PAGE_SHADOW_GUEST_NOT_PRESENT + * Bit 7: _PAGE_PSE, aka _PAGE_PAT + * Bit 6: _PAGE_DIRTY + * Bit 5: _PAGE_ACCESSED + * Bit 4: _PAGE_PCD + * Bit 3: _PAGE_PWT + * Bit 2: _PAGE_USER + * Bit 1: _PAGE_RW ("GW" below) + * Bit 0: _PAGE_PRESENT ("GP" below) + * + * Given a guest entry, as shown below, we can expect the following in the + * corresponding shadow entry: + * + * Guest entry Shadow entry Commentary + * ----------- ---------------- --------------------------------------------- + * Maps + * GP GW IO GP SP GW SW MMIO + * -- -- ---- -- -- -- -- ---- + * - - - 0 0 0 0 0 The guest entry has not yet been shadowed. + * 0 - - 0 0 0 0 1 The guest entry is marked not-present. + * 1 1 no ? 1 ? 1 0 Writable entry in the guest. + * 1 0 no ? 1 0 0 0 Read-only entry in the guest. + * 1 1 yes 0 1 ? 1 1 Writable MMIO mapping in the guest. + * 1 0 yes 0 1 0 0 1 Read-only MMIO mapping in the guest. + * + * Normally, we would expect that GP=1 in the guest to imply GP=1 in the + * shadow, and similarly for GW=1. 
However, various functionality that may be + * implemented via the shadow can cause GP or GW to be cleared in such cases. + * A & D bit emulation is a prime example of such functionality. + * + * If _PAGE_SHADOW_PRESENT is zero, then the _PAGE_PRESENT bit in that same + * entry will always be zero, too. + + * Bit 11 is used in debug builds as the _PAGE_GNTTAB bit in PV guests. It is + * currently available for random (ab)use in shadow entries. + * + * Bit 8 (the global bit) could be propagated from an HVM guest to the shadow, + * but currently there is no benefit, as the guest's TLB is flushed on every + * transition of CR3 anyway due to the HVM exit/re-entry. + * + * In shadow entries in which the _PAGE_SHADOW_PRESENT is set, bit 8 is used + * as the _PAGE_SHADOW_MMIO bit. In such entries, if _PAGE_SHADOW_MMIO is + * set, then the entry contains the *gfn* directly from the corresponding + * guest entry (not an mfn!!). + * + * Bit 7 is set in a guest L2 to signify a superpage entry. The current + * shadow code splinters superpage mappings into 512 or 1024 4K mappings; the + * resulting shadow L1 table is called an FL1. Note that there is no guest + * page that corresponds to an FL1. + * + * Bit 7 in a guest L1 is the PAT2 bit. Currently we do not support PAT in + * this shadow code. + * + * Bit 6 is the dirty bit. + * + * Bit 5 is the accessed bit. + * + * Bit 4 is the cache disable bit. If set in a guest, the hardware is + * supposed to refuse to cache anything found via this entry. It can be set + * in an L4e, L3e, L2e, or L1e. This shadow code currently does not support + * cache disable bits. They are silently ignored. + * + * Bit 4 is a guest L1 is also the PAT1 bit. Currently we do not support PAT + * in this shadow code. + * + * Bit 3 is the cache write-thru bit. If set in a guest, the hardware is + * supposed to use write-thru instead of write-back caching for anything found + * via this entry. It can be set in an L4e, L3e, L2e, or L1e. 
This shadow + * code currently does not support cache write-thru bits. They are silently + * ignored. + * + * Bit 3 is a guest L1 is also the PAT0 bit. Currently we do not support PAT + * in this shadow code. + * + * Bit 2 is the user bit. + * + * Bit 1 is the read-write bit. + * + * Bit 0 is the present bit. + */ + +// Copy of the _PAGE_RW bit from the guest's PTE, appropriately zero'ed by +// the appropriate shadow rules. +#define _PAGE_SHADOW_RW _PAGE_AVAIL1 + +// Copy of the _PAGE_PRESENT bit from the guest's PTE +#define _PAGE_SHADOW_PRESENT _PAGE_AVAIL0 + +// The matching guest entry maps MMIO space +#define _PAGE_SHADOW_MMIO _PAGE_GLOBAL + +// Shadow flags value used when the guest is not present +#define _PAGE_SHADOW_GUEST_NOT_PRESENT _PAGE_GLOBAL + + +/****************************************************************************** + * Debug and error-message output + */ +#define SHADOW_PRINTK(_f, _a...) \ + debugtrace_printk("sh: %s(): " _f, __func__, ##_a) +#define SHADOW_ERROR(_f, _a...) \ + printk("sh error: %s(): " _f, __func__, ##_a) +#define SHADOW_DEBUG(flag, _f, _a...) 
\ + do { \ + if (SHADOW_DEBUG_ ## flag) \ + debugtrace_printk("shdebug: %s(): " _f, __func__, ##_a); \ + } while (0) + +// The flags for use with SHADOW_DEBUG: +#define SHADOW_DEBUG_PROPAGATE 0 +#define SHADOW_DEBUG_MAKE_SHADOW 0 +#define SHADOW_DEBUG_DESTROY_SHADOW 0 +#define SHADOW_DEBUG_P2M 0 +#define SHADOW_DEBUG_A_AND_D 0 +#define SHADOW_DEBUG_EMULATE 0 +#define SHADOW_DEBUG_LOGDIRTY 1 + + +/****************************************************************************** + * Auditing routines + */ + +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL +extern void shadow_audit_tables(struct vcpu *v); +#else +#define shadow_audit_tables(_v) do {} while(0) +#endif + +#if SHADOW_AUDIT & SHADOW_AUDIT_P2M +extern void shadow_audit_p2m(struct domain *d); +#else +#define shadow_audit_p2m(_d) do {} while(0) +#endif + + +/****************************************************************************** + * Mechanism for double-checking the optimized pagefault path: this + * structure contains a record of actions taken by the fault handling + * code. In paranoid mode, the fast-path code fills out one of these + * structures (but doesn't take any actual action) and then the normal + * path fills in another. 
When the fault handler finishes, the + * two are compared */ + +#ifdef SHADOW_OPTIMIZATION_PARANOIA + +typedef struct shadow_action_log sh_log_t; +struct shadow_action_log { + paddr_t ad[CONFIG_PAGING_LEVELS]; /* A & D bits propagated here */ + paddr_t mmio; /* Address of an mmio operation */ + int rv; /* Result of the fault handler */ +}; + +/* There are two logs, one for the fast path, one for the normal path */ +enum sh_log_type { log_slow = 0, log_fast= 1 }; + +/* Alloc and zero the logs */ +static inline void sh_init_log(struct vcpu *v) +{ + if ( unlikely(!v->arch.shadow.action_log) ) + v->arch.shadow.action_log = xmalloc_array(sh_log_t, 2); + ASSERT(v->arch.shadow.action_log); + memset(v->arch.shadow.action_log, 0, 2 * sizeof (sh_log_t)); +} + +/* Log an A&D-bit update */ +static inline void sh_log_ad(struct vcpu *v, paddr_t e, unsigned int level) +{ + v->arch.shadow.action_log[v->arch.shadow.action_index].ad[level] = e; +} + +/* Log an MMIO address */ +static inline void sh_log_mmio(struct vcpu *v, paddr_t m) +{ + v->arch.shadow.action_log[v->arch.shadow.action_index].mmio = m; +} + +/* Log the result */ +static inline void sh_log_rv(struct vcpu *v, int rv) +{ + v->arch.shadow.action_log[v->arch.shadow.action_index].rv = rv; +} + +/* Set which mode we're in */ +static inline void sh_set_log_mode(struct vcpu *v, enum sh_log_type t) +{ + v->arch.shadow.action_index = t; +} + +/* Know not to take action, because we're only checking the mechanism */ +static inline int sh_take_no_action(struct vcpu *v) +{ + return (v->arch.shadow.action_index == log_fast); +} + +#else /* Non-paranoid mode: these logs do not exist */ + +#define sh_init_log(_v) do { (void)(_v); } while(0) +#define sh_set_log_mode(_v,_t) do { (void)(_v); } while(0) +#define sh_log_ad(_v,_e,_l) do { (void)(_v),(void)(_e),(void)(_l); } while (0) +#define sh_log_mmio(_v,_m) do { (void)(_v),(void)(_m); } while (0) +#define sh_log_rv(_v,_r) do { (void)(_v),(void)(_r); } while (0) +#define 
sh_take_no_action(_v) (((void)(_v)), 0) + +#endif /* SHADOW_OPTIMIZATION_PARANOIA */ + + +/****************************************************************************** + * Macro for dealing with the naming of the internal names of the + * shadow code's external entry points. + */ +#define SHADOW_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels) \ + name ## __shadow_ ## shadow_levels ## _guest_ ## guest_levels +#define SHADOW_INTERNAL_NAME(name, shadow_levels, guest_levels) \ + SHADOW_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels) + +#if CONFIG_PAGING_LEVELS == 2 +#define GUEST_LEVELS 2 +#define SHADOW_LEVELS 2 +#include "multi.h" +#undef GUEST_LEVELS +#undef SHADOW_LEVELS +#endif /* CONFIG_PAGING_LEVELS == 2 */ + +#if CONFIG_PAGING_LEVELS == 3 +#define GUEST_LEVELS 2 +#define SHADOW_LEVELS 3 +#include "multi.h" +#undef GUEST_LEVELS +#undef SHADOW_LEVELS + +#define GUEST_LEVELS 3 +#define SHADOW_LEVELS 3 +#include "multi.h" +#undef GUEST_LEVELS +#undef SHADOW_LEVELS +#endif /* CONFIG_PAGING_LEVELS == 3 */ + +#if CONFIG_PAGING_LEVELS == 4 +#define GUEST_LEVELS 2 +#define SHADOW_LEVELS 3 +#include "multi.h" +#undef GUEST_LEVELS +#undef SHADOW_LEVELS + +#define GUEST_LEVELS 3 +#define SHADOW_LEVELS 3 +#include "multi.h" +#undef GUEST_LEVELS +#undef SHADOW_LEVELS + +#define GUEST_LEVELS 3 +#define SHADOW_LEVELS 4 +#include "multi.h" +#undef GUEST_LEVELS +#undef SHADOW_LEVELS + +#define GUEST_LEVELS 4 +#define SHADOW_LEVELS 4 +#include "multi.h" +#undef GUEST_LEVELS +#undef SHADOW_LEVELS +#endif /* CONFIG_PAGING_LEVELS == 4 */ + + +/****************************************************************************** + * Various function declarations + */ + +/* x86 emulator support */ +extern struct x86_emulate_ops shadow_emulator_ops; + +/* Hash table functions */ +mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, u8 t); +void shadow_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn); +void shadow_hash_delete(struct vcpu *v, unsigned long 
n, u8 t, mfn_t smfn); + +/* shadow promotion */ +void shadow_promote(struct vcpu *v, mfn_t gmfn, u32 type); +void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type); + +/* Shadow page allocation functions */ +void shadow_prealloc(struct domain *d, unsigned int order); +mfn_t shadow_alloc(struct domain *d, + u32 shadow_type, + unsigned long backpointer); +void shadow_free(struct domain *d, mfn_t smfn); + +/* Function to convert a shadow to log-dirty */ +void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn); + +/* Dispatcher function: call the per-mode function that will unhook the + * non-Xen mappings in this top-level shadow mfn */ +void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn); + +/* Re-sync copies of PAE shadow L3 tables if they have been changed */ +void sh_pae_recopy(struct domain *d); + +/* Install the xen mappings in various flavours of shadow */ +void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn); +void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn); +void sh_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn); +void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn); + + +/****************************************************************************** + * MFN/page-info handling + */ + +// Override mfn_to_page from asm/page.h, which was #include'd above, +// in order to make it work with our mfn type. +#undef mfn_to_page +#define mfn_to_page(_mfn) (frame_table + mfn_x(_mfn)) + +// Override page_to_mfn from asm/page.h, which was #include'd above, +// in order to make it work with our mfn type. +#undef page_to_mfn +#define page_to_mfn(_pg) (_mfn((_pg) - frame_table)) + +// Override mfn_valid from asm/page.h, which was #include'd above, +// in order to make it work with our mfn type. 
+#undef mfn_valid +#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page) + +// Provide mfn_t-aware versions of common xen functions +static inline void * +sh_map_domain_page(mfn_t mfn) +{ + /* XXX Using the monitor-table as a map will happen here */ + return map_domain_page(mfn_x(mfn)); +} + +static inline void +sh_unmap_domain_page(void *p) +{ + /* XXX Using the monitor-table as a map will happen here */ + unmap_domain_page(p); +} + +static inline void * +sh_map_domain_page_global(mfn_t mfn) +{ + /* XXX Using the monitor-table as a map will happen here */ + return map_domain_page_global(mfn_x(mfn)); +} + +static inline void +sh_unmap_domain_page_global(void *p) +{ + /* XXX Using the monitor-table as a map will happen here */ + unmap_domain_page_global(p); +} + +static inline int +sh_mfn_is_dirty(struct domain *d, mfn_t gmfn) +/* Is this guest page dirty? Call only in log-dirty mode. */ +{ + unsigned long pfn; + ASSERT(shadow_mode_log_dirty(d)); + ASSERT(d->arch.shadow.dirty_bitmap != NULL); + + /* We /really/ mean PFN here, even for non-translated guests. */ + pfn = get_gpfn_from_mfn(mfn_x(gmfn)); + if ( likely(VALID_M2P(pfn)) + && likely(pfn < d->arch.shadow.dirty_bitmap_size) + && test_bit(pfn, d->arch.shadow.dirty_bitmap) ) + return 1; + + return 0; +} + +static inline int +sh_mfn_is_a_page_table(mfn_t gmfn) +{ + struct page_info *page = mfn_to_page(gmfn); + struct domain *owner; + unsigned long type_info; + + if ( !valid_mfn(gmfn) ) + return 0; + + owner = page_get_owner(page); + if ( owner && shadow_mode_refcounts(owner) + && (page->count_info & PGC_page_table) ) + return 1; + + type_info = page->u.inuse.type_info & PGT_type_mask; + return type_info && (type_info <= PGT_l4_page_table); +} + + +/**************************************************************************/ +/* Shadow-page refcounting. 
See comment in shadow-common.c about the + * use of struct page_info fields for shadow pages */ + +void sh_destroy_shadow(struct vcpu *v, mfn_t smfn); + +/* Increase the refcount of a shadow page. Arguments are the mfn to refcount, + * and the physical address of the shadow entry that holds the ref (or zero + * if the ref is held by something else) */ +static inline void sh_get_ref(mfn_t smfn, paddr_t entry_pa) +{ + u32 x, nx; + struct page_info *page = mfn_to_page(smfn); + + ASSERT(mfn_valid(smfn)); + + x = page->count_info & PGC_SH_count_mask; + nx = x + 1; + + if ( unlikely(nx & ~PGC_SH_count_mask) ) + { + SHADOW_PRINTK("shadow ref overflow, gmfn=%" PRtype_info " smfn=%lx\n", + page->u.inuse.type_info, mfn_x(smfn)); + domain_crash_synchronous(); + } + + /* Guarded by the shadow lock, so no need for atomic update */ + page->count_info &= ~PGC_SH_count_mask; + page->count_info |= nx; + + /* We remember the first shadow entry that points to each shadow. */ + if ( entry_pa != 0 && page->up == 0 ) + page->up = entry_pa; +} + + +/* Decrease the refcount of a shadow page. As for get_ref, takes the + * physical address of the shadow entry that held this reference. 
*/ +static inline void sh_put_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa) +{ + u32 x, nx; + struct page_info *page = mfn_to_page(smfn); + + ASSERT(mfn_valid(smfn)); + ASSERT(page_get_owner(page) == NULL); + + /* If this is the entry in the up-pointer, remove it */ + if ( entry_pa != 0 && page->up == entry_pa ) + page->up = 0; + + x = page->count_info & PGC_SH_count_mask; + nx = x - 1; + + if ( unlikely(x == 0) ) + { + SHADOW_PRINTK("shadow ref underflow, smfn=%lx oc=%08x t=%" + PRtype_info "\n", + mfn_x(smfn), + page->count_info & PGC_SH_count_mask, + page->u.inuse.type_info); + domain_crash_synchronous(); + } + + /* Guarded by the shadow lock, so no need for atomic update */ + page->count_info &= ~PGC_SH_count_mask; + page->count_info |= nx; + + if ( unlikely(nx == 0) ) + sh_destroy_shadow(v, smfn); +} + + +/* Pin a shadow page: take an extra refcount and set the pin bit. */ +static inline void sh_pin(mfn_t smfn) +{ + struct page_info *page; + + ASSERT(mfn_valid(smfn)); + page = mfn_to_page(smfn); + if ( !(page->count_info & PGC_SH_pinned) ) + { + sh_get_ref(smfn, 0); + page->count_info |= PGC_SH_pinned; + } +} + +/* Unpin a shadow page: unset the pin bit and release the extra ref. */ +static inline void sh_unpin(struct vcpu *v, mfn_t smfn) +{ + struct page_info *page; + + ASSERT(mfn_valid(smfn)); + page = mfn_to_page(smfn); + if ( page->count_info & PGC_SH_pinned ) + { + page->count_info &= ~PGC_SH_pinned; + sh_put_ref(v, smfn, 0); + } +} + +/**************************************************************************/ +/* Guest physmap (p2m) support */ + +/* Read our own P2M table, checking in the linear pagetables first to be + * sure that we will succeed. Call this function if you expect it to + * fail often, as it avoids page faults. 
If you expect to succeed, use + * vcpu_gfn_to_mfn, which copy_from_user()s the entry */ +static inline mfn_t +vcpu_gfn_to_mfn_nofault(struct vcpu *v, unsigned long gfn) +{ + unsigned long entry_addr = (unsigned long) &phys_to_machine_mapping[gfn]; +#if CONFIG_PAGING_LEVELS >= 4 + l4_pgentry_t *l4e; + l3_pgentry_t *l3e; +#endif + l2_pgentry_t *l2e; + l1_pgentry_t *l1e; + + ASSERT(current == v); + if ( !shadow_vcpu_mode_translate(v) ) + return _mfn(gfn); + +#if CONFIG_PAGING_LEVELS > 2 + if ( gfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) + /* This pfn is higher than the p2m map can hold */ + return _mfn(INVALID_MFN); +#endif + + /* Walk the linear pagetables. Note that this is *not* the same as + * the walk in sh_gfn_to_mfn_foreign, which is walking the p2m map */ +#if CONFIG_PAGING_LEVELS >= 4 + l4e = __linear_l4_table + l4_linear_offset(entry_addr); + if ( !(l4e_get_flags(*l4e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); + l3e = __linear_l3_table + l3_linear_offset(entry_addr); + if ( !(l3e_get_flags(*l3e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); +#endif + l2e = __linear_l2_table + l2_linear_offset(entry_addr); + if ( !(l2e_get_flags(*l2e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); + l1e = __linear_l1_table + l1_linear_offset(entry_addr); + if ( !(l1e_get_flags(*l1e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); + + /* Safe to look at this part of the table */ + if ( l1e_get_flags(phys_to_machine_mapping[gfn]) & _PAGE_PRESENT ) + return _mfn(l1e_get_pfn(phys_to_machine_mapping[gfn])); + + return _mfn(INVALID_MFN); +} + + +#endif /* _XEN_SHADOW_PRIVATE_H */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/arch/x86/mm/shadow/types.h b/xen/arch/x86/mm/shadow/types.h new file mode 100644 index 0000000000..bf1b2ce763 --- /dev/null +++ b/xen/arch/x86/mm/shadow/types.h @@ -0,0 +1,692 @@ 
+/****************************************************************************** + * arch/x86/mm/shadow/types.h + * + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _XEN_SHADOW_TYPES_H +#define _XEN_SHADOW_TYPES_H + +// Map a shadow page +static inline void * +map_shadow_page(mfn_t smfn) +{ + // XXX -- Possible optimization/measurement question for 32-bit and PAE + // hypervisors: + // How often is this smfn already available in the shadow linear + // table? Might it be worth checking that table first, + // presumably using the reverse map hint in the page_info of this + // smfn, rather than calling map_domain_page()? 
+ // + return sh_map_domain_page(smfn); +} + +// matching unmap for map_shadow_page() +static inline void +unmap_shadow_page(void *p) +{ + sh_unmap_domain_page(p); +} + +/* + * Define various types for handling pagetabels, based on these options: + * SHADOW_PAGING_LEVELS : Number of levels of shadow pagetables + * GUEST_PAGING_LEVELS : Number of levels of guest pagetables + */ + +#if (CONFIG_PAGING_LEVELS < SHADOW_PAGING_LEVELS) +#error Cannot have more levels of shadow pagetables than host pagetables +#endif + +#if (SHADOW_PAGING_LEVELS < GUEST_PAGING_LEVELS) +#error Cannot have more levels of guest pagetables than shadow pagetables +#endif + +#if SHADOW_PAGING_LEVELS == 2 +#define SHADOW_L1_PAGETABLE_ENTRIES 1024 +#define SHADOW_L2_PAGETABLE_ENTRIES 1024 +#define SHADOW_L1_PAGETABLE_SHIFT 12 +#define SHADOW_L2_PAGETABLE_SHIFT 22 +#endif + +#if SHADOW_PAGING_LEVELS == 3 +#define SHADOW_L1_PAGETABLE_ENTRIES 512 +#define SHADOW_L2_PAGETABLE_ENTRIES 512 +#define SHADOW_L3_PAGETABLE_ENTRIES 4 +#define SHADOW_L1_PAGETABLE_SHIFT 12 +#define SHADOW_L2_PAGETABLE_SHIFT 21 +#define SHADOW_L3_PAGETABLE_SHIFT 30 +#endif + +#if SHADOW_PAGING_LEVELS == 4 +#define SHADOW_L1_PAGETABLE_ENTRIES 512 +#define SHADOW_L2_PAGETABLE_ENTRIES 512 +#define SHADOW_L3_PAGETABLE_ENTRIES 512 +#define SHADOW_L4_PAGETABLE_ENTRIES 512 +#define SHADOW_L1_PAGETABLE_SHIFT 12 +#define SHADOW_L2_PAGETABLE_SHIFT 21 +#define SHADOW_L3_PAGETABLE_SHIFT 30 +#define SHADOW_L4_PAGETABLE_SHIFT 39 +#endif + +/* Types of the shadow page tables */ +typedef l1_pgentry_t shadow_l1e_t; +typedef l2_pgentry_t shadow_l2e_t; +#if SHADOW_PAGING_LEVELS >= 3 +typedef l3_pgentry_t shadow_l3e_t; +#if SHADOW_PAGING_LEVELS >= 4 +typedef l4_pgentry_t shadow_l4e_t; +#endif +#endif + +/* Access functions for them */ +static inline paddr_t shadow_l1e_get_paddr(shadow_l1e_t sl1e) +{ return l1e_get_paddr(sl1e); } +static inline paddr_t shadow_l2e_get_paddr(shadow_l2e_t sl2e) +{ return l2e_get_paddr(sl2e); } +#if SHADOW_PAGING_LEVELS 
>= 3 +static inline paddr_t shadow_l3e_get_paddr(shadow_l3e_t sl3e) +{ return l3e_get_paddr(sl3e); } +#if SHADOW_PAGING_LEVELS >= 4 +static inline paddr_t shadow_l4e_get_paddr(shadow_l4e_t sl4e) +{ return l4e_get_paddr(sl4e); } +#endif +#endif + +static inline mfn_t shadow_l1e_get_mfn(shadow_l1e_t sl1e) +{ return _mfn(l1e_get_pfn(sl1e)); } +static inline mfn_t shadow_l2e_get_mfn(shadow_l2e_t sl2e) +{ return _mfn(l2e_get_pfn(sl2e)); } +#if SHADOW_PAGING_LEVELS >= 3 +static inline mfn_t shadow_l3e_get_mfn(shadow_l3e_t sl3e) +{ return _mfn(l3e_get_pfn(sl3e)); } +#if SHADOW_PAGING_LEVELS >= 4 +static inline mfn_t shadow_l4e_get_mfn(shadow_l4e_t sl4e) +{ return _mfn(l4e_get_pfn(sl4e)); } +#endif +#endif + +static inline u32 shadow_l1e_get_flags(shadow_l1e_t sl1e) +{ return l1e_get_flags(sl1e); } +static inline u32 shadow_l2e_get_flags(shadow_l2e_t sl2e) +{ return l2e_get_flags(sl2e); } +#if SHADOW_PAGING_LEVELS >= 3 +static inline u32 shadow_l3e_get_flags(shadow_l3e_t sl3e) +{ return l3e_get_flags(sl3e); } +#if SHADOW_PAGING_LEVELS >= 4 +static inline u32 shadow_l4e_get_flags(shadow_l4e_t sl4e) +{ return l4e_get_flags(sl4e); } +#endif +#endif + +static inline shadow_l1e_t +shadow_l1e_remove_flags(shadow_l1e_t sl1e, u32 flags) +{ l1e_remove_flags(sl1e, flags); return sl1e; } + +static inline shadow_l1e_t shadow_l1e_empty(void) +{ return l1e_empty(); } +static inline shadow_l2e_t shadow_l2e_empty(void) +{ return l2e_empty(); } +#if SHADOW_PAGING_LEVELS >= 3 +static inline shadow_l3e_t shadow_l3e_empty(void) +{ return l3e_empty(); } +#if SHADOW_PAGING_LEVELS >= 4 +static inline shadow_l4e_t shadow_l4e_empty(void) +{ return l4e_empty(); } +#endif +#endif + +static inline shadow_l1e_t shadow_l1e_from_mfn(mfn_t mfn, u32 flags) +{ return l1e_from_pfn(mfn_x(mfn), flags); } +static inline shadow_l2e_t shadow_l2e_from_mfn(mfn_t mfn, u32 flags) +{ return l2e_from_pfn(mfn_x(mfn), flags); } +#if SHADOW_PAGING_LEVELS >= 3 +static inline shadow_l3e_t shadow_l3e_from_mfn(mfn_t mfn, u32 
flags) +{ return l3e_from_pfn(mfn_x(mfn), flags); } +#if SHADOW_PAGING_LEVELS >= 4 +static inline shadow_l4e_t shadow_l4e_from_mfn(mfn_t mfn, u32 flags) +{ return l4e_from_pfn(mfn_x(mfn), flags); } +#endif +#endif + +#define shadow_l1_table_offset(a) l1_table_offset(a) +#define shadow_l2_table_offset(a) l2_table_offset(a) +#define shadow_l3_table_offset(a) l3_table_offset(a) +#define shadow_l4_table_offset(a) l4_table_offset(a) + +/**************************************************************************/ +/* Access to the linear mapping of shadow page tables. */ + +/* Offsets into each level of the linear mapping for a virtual address. */ +#define shadow_l1_linear_offset(_a) \ + (((_a) & VADDR_MASK) >> SHADOW_L1_PAGETABLE_SHIFT) +#define shadow_l2_linear_offset(_a) \ + (((_a) & VADDR_MASK) >> SHADOW_L2_PAGETABLE_SHIFT) +#define shadow_l3_linear_offset(_a) \ + (((_a) & VADDR_MASK) >> SHADOW_L3_PAGETABLE_SHIFT) +#define shadow_l4_linear_offset(_a) \ + (((_a) & VADDR_MASK) >> SHADOW_L4_PAGETABLE_SHIFT) + +/* Where to find each level of the linear mapping. For PV guests, we use + * the shadow linear-map self-entry as many times as we need. For HVM + * guests, the shadow doesn't have a linear-map self-entry so we must use + * the monitor-table's linear-map entry N-1 times and then the shadow-map + * entry once. */ +#define __sh_linear_l1_table ((shadow_l1e_t *)(SH_LINEAR_PT_VIRT_START)) +#define __sh_linear_l2_table ((shadow_l2e_t *) \ + (__sh_linear_l1_table + shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START))) + +// shadow linear L3 and L4 tables only exist in 4 level paging... 
+#if SHADOW_PAGING_LEVELS == 4 +#define __sh_linear_l3_table ((shadow_l3e_t *) \ + (__sh_linear_l2_table + shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START))) +#define __sh_linear_l4_table ((shadow_l4e_t *) \ + (__sh_linear_l3_table + shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START))) +#endif + +#define sh_linear_l1_table(v) ({ \ + ASSERT(current == (v)); \ + __sh_linear_l1_table; \ +}) + +#define sh_linear_l2_table(v) ({ \ + ASSERT(current == (v)); \ + ((shadow_l2e_t *) \ + (hvm_guest(v) ? __linear_l1_table : __sh_linear_l1_table) + \ + shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START)); \ +}) + +// shadow linear L3 and L4 tables only exist in 4 level paging... +#if SHADOW_PAGING_LEVELS == 4 +#define sh_linear_l3_table(v) ({ \ + ASSERT(current == (v)); \ + ((shadow_l3e_t *) \ + (hvm_guest(v) ? __linear_l2_table : __sh_linear_l2_table) + \ + shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START)); \ +}) + +// we use l4_pgentry_t instead of shadow_l4e_t below because shadow_l4e_t is +// not defined for when xen_levels==4 & shadow_levels==3... +#define sh_linear_l4_table(v) ({ \ + ASSERT(current == (v)); \ + ((l4_pgentry_t *) \ + (hvm_guest(v) ? 
__linear_l3_table : __sh_linear_l3_table) + \ + shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START)); \ +}) +#endif + +#if GUEST_PAGING_LEVELS == 2 + +#include "page-guest32.h" + +#define GUEST_L1_PAGETABLE_ENTRIES 1024 +#define GUEST_L2_PAGETABLE_ENTRIES 1024 +#define GUEST_L1_PAGETABLE_SHIFT 12 +#define GUEST_L2_PAGETABLE_SHIFT 22 + +/* Type of the guest's frame numbers */ +TYPE_SAFE(u32,gfn) +#define INVALID_GFN ((u32)(-1u)) +#define SH_PRI_gfn "05x" + +/* Types of the guest's page tables */ +typedef l1_pgentry_32_t guest_l1e_t; +typedef l2_pgentry_32_t guest_l2e_t; + +/* Access functions for them */ +static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e) +{ return l1e_get_paddr_32(gl1e); } +static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e) +{ return l2e_get_paddr_32(gl2e); } + +static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e) +{ return _gfn(l1e_get_paddr_32(gl1e) >> PAGE_SHIFT); } +static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e) +{ return _gfn(l2e_get_paddr_32(gl2e) >> PAGE_SHIFT); } + +static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e) +{ return l1e_get_flags_32(gl1e); } +static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e) +{ return l2e_get_flags_32(gl2e); } + +static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags) +{ l1e_add_flags_32(gl1e, flags); return gl1e; } +static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags) +{ l2e_add_flags_32(gl2e, flags); return gl2e; } + +static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags) +{ return l1e_from_pfn_32(gfn_x(gfn), flags); } +static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags) +{ return l2e_from_pfn_32(gfn_x(gfn), flags); } + +#define guest_l1_table_offset(a) l1_table_offset_32(a) +#define guest_l2_table_offset(a) l2_table_offset_32(a) + +/* The shadow types needed for the various levels. 
*/ +#define PGC_SH_l1_shadow PGC_SH_l1_32_shadow +#define PGC_SH_l2_shadow PGC_SH_l2_32_shadow +#define PGC_SH_fl1_shadow PGC_SH_fl1_32_shadow + +#else /* GUEST_PAGING_LEVELS != 2 */ + +#if GUEST_PAGING_LEVELS == 3 +#define GUEST_L1_PAGETABLE_ENTRIES 512 +#define GUEST_L2_PAGETABLE_ENTRIES 512 +#define GUEST_L3_PAGETABLE_ENTRIES 4 +#define GUEST_L1_PAGETABLE_SHIFT 12 +#define GUEST_L2_PAGETABLE_SHIFT 21 +#define GUEST_L3_PAGETABLE_SHIFT 30 +#else /* GUEST_PAGING_LEVELS == 4 */ +#define GUEST_L1_PAGETABLE_ENTRIES 512 +#define GUEST_L2_PAGETABLE_ENTRIES 512 +#define GUEST_L3_PAGETABLE_ENTRIES 512 +#define GUEST_L4_PAGETABLE_ENTRIES 512 +#define GUEST_L1_PAGETABLE_SHIFT 12 +#define GUEST_L2_PAGETABLE_SHIFT 21 +#define GUEST_L3_PAGETABLE_SHIFT 30 +#define GUEST_L4_PAGETABLE_SHIFT 39 +#endif + +/* Type of the guest's frame numbers */ +TYPE_SAFE(unsigned long,gfn) +#define INVALID_GFN ((unsigned long)(-1ul)) +#define SH_PRI_gfn "05lx" + +/* Types of the guest's page tables */ +typedef l1_pgentry_t guest_l1e_t; +typedef l2_pgentry_t guest_l2e_t; +typedef l3_pgentry_t guest_l3e_t; +#if GUEST_PAGING_LEVELS >= 4 +typedef l4_pgentry_t guest_l4e_t; +#endif + +/* Access functions for them */ +static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e) +{ return l1e_get_paddr(gl1e); } +static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e) +{ return l2e_get_paddr(gl2e); } +static inline paddr_t guest_l3e_get_paddr(guest_l3e_t gl3e) +{ return l3e_get_paddr(gl3e); } +#if GUEST_PAGING_LEVELS >= 4 +static inline paddr_t guest_l4e_get_paddr(guest_l4e_t gl4e) +{ return l4e_get_paddr(gl4e); } +#endif + +static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e) +{ return _gfn(l1e_get_paddr(gl1e) >> PAGE_SHIFT); } +static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e) +{ return _gfn(l2e_get_paddr(gl2e) >> PAGE_SHIFT); } +static inline gfn_t guest_l3e_get_gfn(guest_l3e_t gl3e) +{ return _gfn(l3e_get_paddr(gl3e) >> PAGE_SHIFT); } +#if GUEST_PAGING_LEVELS >= 4 +static inline gfn_t 
guest_l4e_get_gfn(guest_l4e_t gl4e) +{ return _gfn(l4e_get_paddr(gl4e) >> PAGE_SHIFT); } +#endif + +static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e) +{ return l1e_get_flags(gl1e); } +static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e) +{ return l2e_get_flags(gl2e); } +static inline u32 guest_l3e_get_flags(guest_l3e_t gl3e) +{ return l3e_get_flags(gl3e); } +#if GUEST_PAGING_LEVELS >= 4 +static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e) +{ return l4e_get_flags(gl4e); } +#endif + +static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags) +{ l1e_add_flags(gl1e, flags); return gl1e; } +static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags) +{ l2e_add_flags(gl2e, flags); return gl2e; } +static inline guest_l3e_t guest_l3e_add_flags(guest_l3e_t gl3e, u32 flags) +{ l3e_add_flags(gl3e, flags); return gl3e; } +#if GUEST_PAGING_LEVELS >= 4 +static inline guest_l4e_t guest_l4e_add_flags(guest_l4e_t gl4e, u32 flags) +{ l4e_add_flags(gl4e, flags); return gl4e; } +#endif + +static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags) +{ return l1e_from_pfn(gfn_x(gfn), flags); } +static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags) +{ return l2e_from_pfn(gfn_x(gfn), flags); } +static inline guest_l3e_t guest_l3e_from_gfn(gfn_t gfn, u32 flags) +{ return l3e_from_pfn(gfn_x(gfn), flags); } +#if GUEST_PAGING_LEVELS >= 4 +static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags) +{ return l4e_from_pfn(gfn_x(gfn), flags); } +#endif + +#define guest_l1_table_offset(a) l1_table_offset(a) +#define guest_l2_table_offset(a) l2_table_offset(a) +#define guest_l3_table_offset(a) l3_table_offset(a) +#define guest_l4_table_offset(a) l4_table_offset(a) + +/* The shadow types needed for the various levels. 
*/ +#if GUEST_PAGING_LEVELS == 3 +#define PGC_SH_l1_shadow PGC_SH_l1_pae_shadow +#define PGC_SH_fl1_shadow PGC_SH_fl1_pae_shadow +#define PGC_SH_l2_shadow PGC_SH_l2_pae_shadow +#define PGC_SH_l2h_shadow PGC_SH_l2h_pae_shadow +#define PGC_SH_l3_shadow PGC_SH_l3_pae_shadow +#else +#define PGC_SH_l1_shadow PGC_SH_l1_64_shadow +#define PGC_SH_fl1_shadow PGC_SH_fl1_64_shadow +#define PGC_SH_l2_shadow PGC_SH_l2_64_shadow +#define PGC_SH_l3_shadow PGC_SH_l3_64_shadow +#define PGC_SH_l4_shadow PGC_SH_l4_64_shadow +#endif + +#endif /* GUEST_PAGING_LEVELS != 2 */ + +#define VALID_GFN(m) (m != INVALID_GFN) + +static inline int +valid_gfn(gfn_t m) +{ + return VALID_GFN(gfn_x(m)); +} + +#if GUEST_PAGING_LEVELS == 2 +#define PGC_SH_guest_root_type PGC_SH_l2_32_shadow +#elif GUEST_PAGING_LEVELS == 3 +#define PGC_SH_guest_root_type PGC_SH_l3_pae_shadow +#else +#define PGC_SH_guest_root_type PGC_SH_l4_64_shadow +#endif + +/* Translation between mfns and gfns */ +static inline mfn_t +vcpu_gfn_to_mfn(struct vcpu *v, gfn_t gfn) +{ + return sh_vcpu_gfn_to_mfn(v, gfn_x(gfn)); +} + +static inline gfn_t +mfn_to_gfn(struct domain *d, mfn_t mfn) +{ + return _gfn(sh_mfn_to_gfn(d, mfn)); +} + +static inline paddr_t +gfn_to_paddr(gfn_t gfn) +{ + return ((paddr_t)gfn_x(gfn)) << PAGE_SHIFT; +} + +/* Type used for recording a walk through guest pagetables. It is + * filled in by the pagetable walk function, and also used as a cache + * for later walks. + * Any non-null pointer in this structure represents a mapping of guest + * memory. We must always call walk_init() before using a walk_t, and + * call walk_unmap() when we're done. + * The "Effective l1e" field is used when there isn't an l1e to point to, + * but we have fabricated an l1e for propagation to the shadow (e.g., + * for splintering guest superpages into many shadow l1 entries). 
*/ +typedef struct shadow_walk_t walk_t; +struct shadow_walk_t +{ + unsigned long va; /* Address we were looking for */ +#if GUEST_PAGING_LEVELS >= 3 +#if GUEST_PAGING_LEVELS >= 4 + guest_l4e_t *l4e; /* Pointer to guest's level 4 entry */ +#endif + guest_l3e_t *l3e; /* Pointer to guest's level 3 entry */ +#endif + guest_l2e_t *l2e; /* Pointer to guest's level 2 entry */ + guest_l1e_t *l1e; /* Pointer to guest's level 1 entry */ + guest_l1e_t eff_l1e; /* Effective level 1 entry */ +#if GUEST_PAGING_LEVELS >= 3 +#if GUEST_PAGING_LEVELS >= 4 + mfn_t l4mfn; /* MFN that the level 4 entry is in */ +#endif + mfn_t l3mfn; /* MFN that the level 3 entry is in */ +#endif + mfn_t l2mfn; /* MFN that the level 2 entry is in */ + mfn_t l1mfn; /* MFN that the level 1 entry is in */ +}; + +/* macros for dealing with the naming of the internal function names of the + * shadow code's external entry points. + */ +#define INTERNAL_NAME(name) \ + SHADOW_INTERNAL_NAME(name, SHADOW_PAGING_LEVELS, GUEST_PAGING_LEVELS) + +/* macros for renaming the primary entry points, so that they are more + * easily distinguished from a debugger + */ +#define sh_page_fault INTERNAL_NAME(sh_page_fault) +#define sh_invlpg INTERNAL_NAME(sh_invlpg) +#define sh_gva_to_gpa INTERNAL_NAME(sh_gva_to_gpa) +#define sh_gva_to_gfn INTERNAL_NAME(sh_gva_to_gfn) +#define sh_update_cr3 INTERNAL_NAME(sh_update_cr3) +#define sh_remove_write_access INTERNAL_NAME(sh_remove_write_access) +#define sh_remove_all_mappings INTERNAL_NAME(sh_remove_all_mappings) +#define sh_remove_l1_shadow INTERNAL_NAME(sh_remove_l1_shadow) +#define sh_remove_l2_shadow INTERNAL_NAME(sh_remove_l2_shadow) +#define sh_remove_l3_shadow INTERNAL_NAME(sh_remove_l3_shadow) +#define sh_map_and_validate_gl4e INTERNAL_NAME(sh_map_and_validate_gl4e) +#define sh_map_and_validate_gl3e INTERNAL_NAME(sh_map_and_validate_gl3e) +#define sh_map_and_validate_gl2e INTERNAL_NAME(sh_map_and_validate_gl2e) +#define sh_map_and_validate_gl2he 
INTERNAL_NAME(sh_map_and_validate_gl2he) +#define sh_map_and_validate_gl1e INTERNAL_NAME(sh_map_and_validate_gl1e) +#define sh_destroy_l4_shadow INTERNAL_NAME(sh_destroy_l4_shadow) +#define sh_destroy_l3_shadow INTERNAL_NAME(sh_destroy_l3_shadow) +#define sh_destroy_l3_subshadow INTERNAL_NAME(sh_destroy_l3_subshadow) +#define sh_unpin_all_l3_subshadows INTERNAL_NAME(sh_unpin_all_l3_subshadows) +#define sh_destroy_l2_shadow INTERNAL_NAME(sh_destroy_l2_shadow) +#define sh_destroy_l1_shadow INTERNAL_NAME(sh_destroy_l1_shadow) +#define sh_unhook_32b_mappings INTERNAL_NAME(sh_unhook_32b_mappings) +#define sh_unhook_pae_mappings INTERNAL_NAME(sh_unhook_pae_mappings) +#define sh_unhook_64b_mappings INTERNAL_NAME(sh_unhook_64b_mappings) +#define sh_paging_mode INTERNAL_NAME(sh_paging_mode) +#define sh_detach_old_tables INTERNAL_NAME(sh_detach_old_tables) +#define sh_x86_emulate_write INTERNAL_NAME(sh_x86_emulate_write) +#define sh_x86_emulate_cmpxchg INTERNAL_NAME(sh_x86_emulate_cmpxchg) +#define sh_x86_emulate_cmpxchg8b INTERNAL_NAME(sh_x86_emulate_cmpxchg8b) +#define sh_audit_l1_table INTERNAL_NAME(sh_audit_l1_table) +#define sh_audit_fl1_table INTERNAL_NAME(sh_audit_fl1_table) +#define sh_audit_l2_table INTERNAL_NAME(sh_audit_l2_table) +#define sh_audit_l3_table INTERNAL_NAME(sh_audit_l3_table) +#define sh_audit_l4_table INTERNAL_NAME(sh_audit_l4_table) +#define sh_guess_wrmap INTERNAL_NAME(sh_guess_wrmap) +#define sh_clear_shadow_entry INTERNAL_NAME(sh_clear_shadow_entry) + +/* sh_make_monitor_table only depends on the number of shadow levels */ +#define sh_make_monitor_table \ + SHADOW_INTERNAL_NAME(sh_make_monitor_table, \ + SHADOW_PAGING_LEVELS, \ + SHADOW_PAGING_LEVELS) +#define sh_destroy_monitor_table \ + SHADOW_INTERNAL_NAME(sh_destroy_monitor_table, \ + SHADOW_PAGING_LEVELS, \ + SHADOW_PAGING_LEVELS) + + +#if GUEST_PAGING_LEVELS == 3 +/* + * Accounting information stored in the shadow of PAE Guest L3 pages. 
+ * Because these "L3 pages" are only 32-bytes, it is inconvenient to keep + * various refcounts, etc., on the page_info of their page. We provide extra + * bookkeeping space in the shadow itself, and this is the structure + * definition for that bookkeeping information. + */ +struct pae_l3_bookkeeping { + u32 vcpus; /* bitmap of which vcpus are currently storing + * copies of this 32-byte page */ + u32 refcount; /* refcount for this 32-byte page */ + u8 pinned; /* is this 32-byte page pinned or not? */ +}; + +// Convert a shadow entry pointer into a pae_l3_bookkeeping pointer. +#define sl3p_to_info(_ptr) ((struct pae_l3_bookkeeping *) \ + (((unsigned long)(_ptr) & ~31) + 32)) + +static void sh_destroy_l3_subshadow(struct vcpu *v, + shadow_l3e_t *sl3e); + +/* Increment a subshadow ref + * Called with a pointer to the subshadow, and the mfn of the + * *first* page of the overall shadow. */ +static inline void sh_get_ref_l3_subshadow(shadow_l3e_t *sl3e, mfn_t smfn) +{ + struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e); + + /* First ref to the subshadow takes a ref to the full shadow */ + if ( bk->refcount == 0 ) + sh_get_ref(smfn, 0); + if ( unlikely(++(bk->refcount) == 0) ) + { + SHADOW_PRINTK("shadow l3 subshadow ref overflow, smfn=%" SH_PRI_mfn " sh=%p\n", + mfn_x(smfn), sl3e); + domain_crash_synchronous(); + } +} + +/* Decrement a subshadow ref. + * Called with a pointer to the subshadow, and the mfn of the + * *first* page of the overall shadow. Calling this may cause the + * entire shadow to disappear, so the caller must immediately unmap + * the pointer after calling. 
*/ +static inline void sh_put_ref_l3_subshadow(struct vcpu *v, + shadow_l3e_t *sl3e, + mfn_t smfn) +{ + struct pae_l3_bookkeeping *bk; + + bk = sl3p_to_info(sl3e); + + ASSERT(bk->refcount > 0); + if ( --(bk->refcount) == 0 ) + { + /* Need to destroy this subshadow */ + sh_destroy_l3_subshadow(v, sl3e); + /* Last ref to the subshadow had a ref to the full shadow */ + sh_put_ref(v, smfn, 0); + } +} + +/* Pin a subshadow + * Called with a pointer to the subshadow, and the mfn of the + * *first* page of the overall shadow. */ +static inline void sh_pin_l3_subshadow(shadow_l3e_t *sl3e, mfn_t smfn) +{ + struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e); + +#if 0 + debugtrace_printk("%s smfn=%05lx offset=%ld\n", + __func__, mfn_x(smfn), + ((unsigned long)sl3e & ~PAGE_MASK) / 64); +#endif + + if ( !bk->pinned ) + { + bk->pinned = 1; + sh_get_ref_l3_subshadow(sl3e, smfn); + } +} + +/* Unpin a sub-shadow. + * Called with a pointer to the subshadow, and the mfn of the + * *first* page of the overall shadow. Calling this may cause the + * entire shadow to disappear, so the caller must immediately unmap + * the pointer after calling. 
*/ +static inline void sh_unpin_l3_subshadow(struct vcpu *v, + shadow_l3e_t *sl3e, + mfn_t smfn) +{ + struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e); + +#if 0 + debugtrace_printk("%s smfn=%05lx offset=%ld\n", + __func__, mfn_x(smfn), + ((unsigned long)sl3e & ~PAGE_MASK) / 64); +#endif + + if ( bk->pinned ) + { + bk->pinned = 0; + sh_put_ref_l3_subshadow(v, sl3e, smfn); + } +} + +#endif /* GUEST_PAGING_LEVELS == 3 */ + +#if SHADOW_PAGING_LEVELS == 3 +#define MFN_FITS_IN_HVM_CR3(_MFN) !(mfn_x(_MFN) >> 20) +#endif + +#if SHADOW_PAGING_LEVELS == 2 +#define SH_PRI_pte "08x" +#else /* SHADOW_PAGING_LEVELS >= 3 */ +#ifndef __x86_64__ +#define SH_PRI_pte "016llx" +#else +#define SH_PRI_pte "016lx" +#endif +#endif /* SHADOW_PAGING_LEVELS >= 3 */ + +#if GUEST_PAGING_LEVELS == 2 +#define SH_PRI_gpte "08x" +#else /* GUEST_PAGING_LEVELS >= 3 */ +#ifndef __x86_64__ +#define SH_PRI_gpte "016llx" +#else +#define SH_PRI_gpte "016lx" +#endif +#endif /* GUEST_PAGING_LEVELS >= 3 */ + +static inline u32 +accumulate_guest_flags(walk_t *gw) +{ + u32 accumulated_flags; + + // We accumulate the permission flags with bitwise ANDing. + // This works for the PRESENT bit, RW bit, and USER bit. + // For the NX bit, however, the polarity is wrong, so we accumulate the + // inverse of the NX bit. + // + accumulated_flags = guest_l1e_get_flags(gw->eff_l1e) ^ _PAGE_NX_BIT; + accumulated_flags &= guest_l2e_get_flags(*gw->l2e) ^ _PAGE_NX_BIT; + + // Note that PAE guests do not have USER or RW or NX bits in their L3s. 
+ // +#if GUEST_PAGING_LEVELS == 3 + accumulated_flags &= + ~_PAGE_PRESENT | (guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT); +#elif GUEST_PAGING_LEVELS >= 4 + accumulated_flags &= guest_l3e_get_flags(*gw->l3e) ^ _PAGE_NX_BIT; + accumulated_flags &= guest_l4e_get_flags(*gw->l4e) ^ _PAGE_NX_BIT; +#endif + + // Finally, revert the NX bit back to its original polarity + accumulated_flags ^= _PAGE_NX_BIT; + + return accumulated_flags; +} + +#endif /* _XEN_SHADOW_TYPES_H */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/arch/x86/shadow2-common.c b/xen/arch/x86/shadow2-common.c deleted file mode 100644 index bdb7c38e87..0000000000 --- a/xen/arch/x86/shadow2-common.c +++ /dev/null @@ -1,3407 +0,0 @@ -/****************************************************************************** - * arch/x86/shadow2-common.c - * - * Shadow2 code that does not need to be multiply compiled. - * Parts of this code are Copyright (c) 2006 by XenSource Inc. - * Parts of this code are Copyright (c) 2006 by Michael A Fetterman - * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#define SHADOW2 1 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if SHADOW2_AUDIT -int shadow2_audit_enable = 0; - -static void shadow2_audit_key(unsigned char key) -{ - shadow2_audit_enable = !shadow2_audit_enable; - printk("%s shadow2_audit_enable=%d\n", - __func__, shadow2_audit_enable); -} - -static int __init shadow2_audit_key_init(void) -{ - register_keyhandler( - 'O', shadow2_audit_key, "toggle shadow2 audits"); - return 0; -} -__initcall(shadow2_audit_key_init); -#endif /* SHADOW2_AUDIT */ - -static void sh2_free_log_dirty_bitmap(struct domain *d); - -int _shadow2_mode_refcounts(struct domain *d) -{ - return shadow2_mode_refcounts(d); -} - - -/**************************************************************************/ -/* x86 emulator support for the shadow2 code - */ - -static int -sh2_x86_emulate_read_std(unsigned long addr, - unsigned long *val, - unsigned int bytes, - struct x86_emulate_ctxt *ctxt) -{ - struct vcpu *v = current; - if ( hvm_guest(v) ) - { - *val = 0; - // XXX -- this is WRONG. - // It entirely ignores the permissions in the page tables. - // In this case, that is only a user vs supervisor access check. - // - if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) ) - { -#if 0 - SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", - v->domain->domain_id, v->vcpu_id, - addr, *val, bytes); -#endif - return X86EMUL_CONTINUE; - } - - /* If we got here, there was nothing mapped here, or a bad GFN - * was mapped here. This should never happen: we're here because - * of a write fault at the end of the instruction we're emulating. 
*/ - SHADOW2_PRINTK("read failed to va %#lx\n", addr); - return X86EMUL_PROPAGATE_FAULT; - } - else - { - SHADOW2_PRINTK("this operation is not emulated yet\n"); - return X86EMUL_UNHANDLEABLE; - } -} - -static int -sh2_x86_emulate_write_std(unsigned long addr, - unsigned long val, - unsigned int bytes, - struct x86_emulate_ctxt *ctxt) -{ - struct vcpu *v = current; -#if 0 - SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", - v->domain->domain_id, v->vcpu_id, addr, val, bytes); -#endif - if ( hvm_guest(v) ) - { - // XXX -- this is WRONG. - // It entirely ignores the permissions in the page tables. - // In this case, that includes user vs supervisor, and - // write access. - // - if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) ) - return X86EMUL_CONTINUE; - - /* If we got here, there was nothing mapped here, or a bad GFN - * was mapped here. This should never happen: we're here because - * of a write fault at the end of the instruction we're emulating, - * which should be handled by sh2_x86_emulate_write_emulated. 
*/ - SHADOW2_PRINTK("write failed to va %#lx\n", addr); - return X86EMUL_PROPAGATE_FAULT; - } - else - { - SHADOW2_PRINTK("this operation is not emulated yet\n"); - return X86EMUL_UNHANDLEABLE; - } -} - -static int -sh2_x86_emulate_write_emulated(unsigned long addr, - unsigned long val, - unsigned int bytes, - struct x86_emulate_ctxt *ctxt) -{ - struct vcpu *v = current; -#if 0 - SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", - v->domain->domain_id, v->vcpu_id, addr, val, bytes); -#endif - if ( hvm_guest(v) ) - { - return v->arch.shadow2.mode->x86_emulate_write(v, addr, &val, bytes, ctxt); - } - else - { - SHADOW2_PRINTK("this operation is not emulated yet\n"); - return X86EMUL_UNHANDLEABLE; - } -} - -static int -sh2_x86_emulate_cmpxchg_emulated(unsigned long addr, - unsigned long old, - unsigned long new, - unsigned int bytes, - struct x86_emulate_ctxt *ctxt) -{ - struct vcpu *v = current; -#if 0 - SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n", - v->domain->domain_id, v->vcpu_id, addr, old, new, bytes); -#endif - if ( hvm_guest(v) ) - { - return v->arch.shadow2.mode->x86_emulate_cmpxchg(v, addr, old, new, - bytes, ctxt); - } - else - { - SHADOW2_PRINTK("this operation is not emulated yet\n"); - return X86EMUL_UNHANDLEABLE; - } -} - -static int -sh2_x86_emulate_cmpxchg8b_emulated(unsigned long addr, - unsigned long old_lo, - unsigned long old_hi, - unsigned long new_lo, - unsigned long new_hi, - struct x86_emulate_ctxt *ctxt) -{ - struct vcpu *v = current; -#if 0 - SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n", - v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo, - new_hi, new_lo, ctxt); -#endif - if ( hvm_guest(v) ) - { - return v->arch.shadow2.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi, - new_lo, new_hi, ctxt); - } - else - { - SHADOW2_PRINTK("this operation is not emulated yet\n"); - return X86EMUL_UNHANDLEABLE; - } -} - - -struct x86_emulate_ops shadow2_emulator_ops = { - .read_std = sh2_x86_emulate_read_std, - 
.write_std = sh2_x86_emulate_write_std, - .read_emulated = sh2_x86_emulate_read_std, - .write_emulated = sh2_x86_emulate_write_emulated, - .cmpxchg_emulated = sh2_x86_emulate_cmpxchg_emulated, - .cmpxchg8b_emulated = sh2_x86_emulate_cmpxchg8b_emulated, -}; - - -/**************************************************************************/ -/* Code for "promoting" a guest page to the point where the shadow code is - * willing to let it be treated as a guest page table. This generally - * involves making sure there are no writable mappings available to the guest - * for this page. - */ -void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type) -{ - struct page_info *page = mfn_to_page(gmfn); - unsigned long type_info; - - ASSERT(valid_mfn(gmfn)); - - /* We should never try to promote a gmfn that has writeable mappings */ - ASSERT(shadow2_remove_write_access(v, gmfn, 0, 0) == 0); - - // Is the page already shadowed? - if ( !test_and_set_bit(_PGC_page_table, &page->count_info) ) - { - // No prior shadow exists... - - // Grab a type-ref. We don't really care if we are racing with another - // vcpu or not, or even what kind of type we get; we just want the type - // count to be > 0. - // - do { - type_info = - page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask); - } while ( !get_page_type(page, type_info) ); - - // Now that the type ref is non-zero, we can safely use the - // shadow2_flags. 
- // - page->shadow2_flags = 0; - } - - ASSERT(!test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags)); - set_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags); -} - -void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type) -{ - struct page_info *page = mfn_to_page(gmfn); - - ASSERT(test_bit(_PGC_page_table, &page->count_info)); - ASSERT(test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags)); - - clear_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags); - - if ( (page->shadow2_flags & SH2F_page_type_mask) == 0 ) - { - // release the extra type ref - put_page_type(page); - - // clear the is-a-page-table bit. - clear_bit(_PGC_page_table, &page->count_info); - } -} - -/**************************************************************************/ -/* Validate a pagetable change from the guest and update the shadows. - * Returns a bitmask of SHADOW2_SET_* flags. */ - -static int -__shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, - void *entry, u32 size) -{ - int result = 0; - struct page_info *page = mfn_to_page(gmfn); - - sh2_mark_dirty(v->domain, gmfn); - - // Determine which types of shadows are affected, and update each. - // - // Always validate L1s before L2s to prevent another cpu with a linear - // mapping of this gmfn from seeing a walk that results from - // using the new L2 value and the old L1 value. (It is OK for such a - // guest to see a walk that uses the old L2 value with the new L1 value, - // as hardware could behave this way if one level of the pagewalk occurs - // before the store, and the next level of the pagewalk occurs after the - // store. - // - // Ditto for L2s before L3s, etc. 
- // - - if ( !(page->count_info & PGC_page_table) ) - return 0; /* Not shadowed at all */ - -#if CONFIG_PAGING_LEVELS == 2 - if ( page->shadow2_flags & SH2F_L1_32 ) - result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 2, 2) - (v, gmfn, entry, size); -#else - if ( page->shadow2_flags & SH2F_L1_32 ) - result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 2) - (v, gmfn, entry, size); -#endif - -#if CONFIG_PAGING_LEVELS == 2 - if ( page->shadow2_flags & SH2F_L2_32 ) - result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 2, 2) - (v, gmfn, entry, size); -#else - if ( page->shadow2_flags & SH2F_L2_32 ) - result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 2) - (v, gmfn, entry, size); -#endif - -#if CONFIG_PAGING_LEVELS >= 3 - if ( page->shadow2_flags & SH2F_L1_PAE ) - result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 3) - (v, gmfn, entry, size); - if ( page->shadow2_flags & SH2F_L2_PAE ) - result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 3) - (v, gmfn, entry, size); - if ( page->shadow2_flags & SH2F_L2H_PAE ) - result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, 3, 3) - (v, gmfn, entry, size); - if ( page->shadow2_flags & SH2F_L3_PAE ) - result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 3, 3) - (v, gmfn, entry, size); -#else /* 32-bit non-PAE hypervisor does not support PAE guests */ - ASSERT((page->shadow2_flags & (SH2F_L3_PAE|SH2F_L2_PAE|SH2F_L1_PAE)) == 0); -#endif - -#if CONFIG_PAGING_LEVELS >= 4 - if ( page->shadow2_flags & SH2F_L1_64 ) - result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 4, 4) - (v, gmfn, entry, size); - if ( page->shadow2_flags & SH2F_L2_64 ) - result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 4, 4) - (v, gmfn, entry, size); - if ( page->shadow2_flags & SH2F_L3_64 ) - result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 4, 4) - (v, gmfn, entry, size); - if ( page->shadow2_flags & SH2F_L4_64 ) - result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, 
4, 4) - (v, gmfn, entry, size); -#else /* 32-bit/PAE hypervisor does not support 64-bit guests */ - ASSERT((page->shadow2_flags - & (SH2F_L4_64|SH2F_L3_64|SH2F_L2_64|SH2F_L1_64)) == 0); -#endif - - return result; -} - - -int -shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry) -/* This is the entry point from hypercalls. It returns a bitmask of all the - * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */ -{ - int rc; - - ASSERT(shadow2_lock_is_acquired(v->domain)); - rc = __shadow2_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t)); - shadow2_audit_tables(v); - return rc; -} - -void -shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, - void *entry, u32 size) -/* This is the entry point for emulated writes to pagetables in HVM guests */ -{ - struct domain *d = v->domain; - int rc; - - ASSERT(shadow2_lock_is_acquired(v->domain)); - rc = __shadow2_validate_guest_entry(v, gmfn, entry, size); - if ( rc & SHADOW2_SET_FLUSH ) - { - // Flush everyone except the local processor, which will flush when it - // re-enters the HVM guest. - // - cpumask_t mask = d->domain_dirty_cpumask; - cpu_clear(v->processor, mask); - flush_tlb_mask(mask); - } - if ( rc & SHADOW2_SET_ERROR ) - { - /* This page is probably not a pagetable any more: tear it out of the - * shadows, along with any tables that reference it */ - shadow2_remove_all_shadows_and_parents(v, gmfn); - } - /* We ignore the other bits: since we are about to change CR3 on - * VMENTER we don't need to do any extra TLB flushes. */ -} - - -/**************************************************************************/ -/* Memory management for shadow pages. */ - -/* Meaning of the count_info field in shadow pages - * ---------------------------------------------- - * - * A count of all references to this page from other shadow pages and - * guest CR3s (a.k.a. v->arch.shadow2.table). - * - * The top bits hold the shadow type and the pinned bit. 
Top-level - * shadows are pinned so that they don't disappear when not in a CR3 - * somewhere. - * - * We don't need to use get|put_page for this as the updates are all - * protected by the shadow lock. We can't use get|put_page for this - * as the size of the count on shadow pages is different from that on - * normal guest pages. - */ - -/* Meaning of the type_info field in shadow pages - * ---------------------------------------------- - * - * type_info use depends on the shadow type (from count_info) - * - * PGC_SH2_none : This page is in the shadow2 free pool. type_info holds - * the chunk order for our freelist allocator. - * - * PGC_SH2_l*_shadow : This page is in use as a shadow. type_info - * holds the mfn of the guest page being shadowed, - * - * PGC_SH2_fl1_*_shadow : This page is being used to shatter a superpage. - * type_info holds the gfn being shattered. - * - * PGC_SH2_monitor_table : This page is part of a monitor table. - * type_info is not used. - */ - -/* Meaning of the _domain field in shadow pages - * -------------------------------------------- - * - * In shadow pages, this field will always have its least significant bit - * set. This ensures that all attempts to get_page() will fail (as all - * valid pickled domain pointers have a zero for their least significant bit). - * Instead, the remaining upper bits are used to record the shadow generation - * counter when the shadow was created. - */ - -/* Meaning of the shadow2_flags field - * ---------------------------------- - * - * In guest pages that are shadowed, one bit for each kind of shadow they have. - * - * In shadow pages, will be used for holding a representation of the populated - * entries in this shadow (either a min/max, or a bitmap, or ...) - * - * In monitor-table pages, holds the level of the particular page (to save - * spilling the shadow types into an extra bit by having three types of monitor - * page). 
- */ - -/* Meaning of the list_head struct in shadow pages - * ----------------------------------------------- - * - * In free shadow pages, this is used to hold the free-lists of chunks. - * - * In top-level shadow tables, this holds a linked-list of all top-level - * shadows (used for recovering memory and destroying shadows). - * - * In lower-level shadows, this holds the physical address of a higher-level - * shadow entry that holds a reference to this shadow (or zero). - */ - -/* Allocating shadow pages - * ----------------------- - * - * Most shadow pages are allocated singly, but there are two cases where we - * need to allocate multiple pages together. - * - * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows. - * A 32-bit guest l1 table covers 4MB of virtuial address space, - * and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB - * of virtual address space each). Similarly, a 32-bit guest l2 table - * (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va - * each). These multi-page shadows are contiguous and aligned; - * functions for handling offsets into them are defined in shadow2.c - * (shadow_l1_index() etc.) - * - * 2: Shadowing PAE top-level pages. Each guest page that contains - * any PAE top-level pages requires two shadow pages to shadow it. - * They contain alternating l3 tables and pae_l3_bookkeeping structs. - * - * This table shows the allocation behaviour of the different modes: - * - * Xen paging 32b pae pae 64b 64b 64b - * Guest paging 32b 32b pae 32b pae 64b - * PV or HVM * HVM * HVM HVM * - * Shadow paging 32b pae pae pae pae 64b - * - * sl1 size 4k 8k 4k 8k 4k 4k - * sl2 size 4k 16k 4k 16k 4k 4k - * sl3 size - - 8k - 8k 4k - * sl4 size - - - - - 4k - * - * We allocate memory from xen in four-page units and break them down - * with a simple buddy allocator. Can't use the xen allocator to handle - * this as it only works for contiguous zones, and a domain's shadow - * pool is made of fragments. 
- * - * In HVM guests, the p2m table is built out of shadow pages, and we provide - * a function for the p2m management to steal pages, in max-order chunks, from - * the free pool. We don't provide for giving them back, yet. - */ - -/* Figure out the least acceptable quantity of shadow memory. - * The minimum memory requirement for always being able to free up a - * chunk of memory is very small -- only three max-order chunks per - * vcpu to hold the top level shadows and pages with Xen mappings in them. - * - * But for a guest to be guaranteed to successfully execute a single - * instruction, we must be able to map a large number (about thirty) VAs - * at the same time, which means that to guarantee progress, we must - * allow for more than ninety allocated pages per vcpu. We round that - * up to 128 pages, or half a megabyte per vcpu. */ -unsigned int shadow2_min_acceptable_pages(struct domain *d) -{ - u32 vcpu_count = 0; - struct vcpu *v; - - for_each_vcpu(d, v) - vcpu_count++; - - return (vcpu_count * 128); -} - -/* Using the type_info field to store freelist order */ -#define SH2_PFN_ORDER(_p) ((_p)->u.inuse.type_info) -#define SH2_SET_PFN_ORDER(_p, _o) \ - do { (_p)->u.inuse.type_info = (_o); } while (0) - - -/* Figure out the order of allocation needed for a given shadow type */ -static inline u32 -shadow_order(u32 shadow_type) -{ -#if CONFIG_PAGING_LEVELS > 2 - static const u32 type_to_order[16] = { - 0, /* PGC_SH2_none */ - 1, /* PGC_SH2_l1_32_shadow */ - 1, /* PGC_SH2_fl1_32_shadow */ - 2, /* PGC_SH2_l2_32_shadow */ - 0, /* PGC_SH2_l1_pae_shadow */ - 0, /* PGC_SH2_fl1_pae_shadow */ - 0, /* PGC_SH2_l2_pae_shadow */ - 0, /* PGC_SH2_l2h_pae_shadow */ - 1, /* PGC_SH2_l3_pae_shadow */ - 0, /* PGC_SH2_l1_64_shadow */ - 0, /* PGC_SH2_fl1_64_shadow */ - 0, /* PGC_SH2_l2_64_shadow */ - 0, /* PGC_SH2_l3_64_shadow */ - 0, /* PGC_SH2_l4_64_shadow */ - 2, /* PGC_SH2_p2m_table */ - 0 /* PGC_SH2_monitor_table */ - }; - u32 type = (shadow_type & PGC_SH2_type_mask) >> 
PGC_SH2_type_shift; - return type_to_order[type]; -#else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */ - return 0; -#endif -} - - -/* Do we have a free chunk of at least this order? */ -static inline int chunk_is_available(struct domain *d, int order) -{ - int i; - - for ( i = order; i <= SHADOW2_MAX_ORDER; i++ ) - if ( !list_empty(&d->arch.shadow2.freelists[i]) ) - return 1; - return 0; -} - -/* Dispatcher function: call the per-mode function that will unhook the - * non-Xen mappings in this top-level shadow mfn */ -void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn) -{ - struct page_info *pg = mfn_to_page(smfn); - switch ( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift ) - { - case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift: -#if CONFIG_PAGING_LEVELS == 2 - SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,2,2)(v,smfn); -#else - SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,3,2)(v,smfn); -#endif - break; -#if CONFIG_PAGING_LEVELS >= 3 - case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift: - SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings,3,3)(v,smfn); - break; -#endif -#if CONFIG_PAGING_LEVELS >= 4 - case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift: - SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings,4,4)(v,smfn); - break; -#endif - default: - SHADOW2_PRINTK("top-level shadow has bad type %08lx\n", - (unsigned long)((pg->count_info & PGC_SH2_type_mask) - >> PGC_SH2_type_shift)); - BUG(); - } -} - - -/* Make sure there is at least one chunk of the required order available - * in the shadow page pool. This must be called before any calls to - * shadow2_alloc(). Since this will free existing shadows to make room, - * it must be called early enough to avoid freeing shadows that the - * caller is currently working on. 
*/ -void shadow2_prealloc(struct domain *d, unsigned int order) -{ - /* Need a vpcu for calling unpins; for now, since we don't have - * per-vcpu shadows, any will do */ - struct vcpu *v = d->vcpu[0]; - struct list_head *l, *t; - struct page_info *pg; - mfn_t smfn; - - if ( chunk_is_available(d, order) ) return; - - /* Stage one: walk the list of top-level pages, unpinning them */ - perfc_incrc(shadow2_prealloc_1); - list_for_each_backwards_safe(l, t, &d->arch.shadow2.toplevel_shadows) - { - pg = list_entry(l, struct page_info, list); - smfn = page_to_mfn(pg); - -#if CONFIG_PAGING_LEVELS >= 3 - if ( (pg->count_info & PGC_SH2_type_mask) == PGC_SH2_l3_pae_shadow ) - { - /* For PAE, we need to unpin each subshadow on this shadow */ - SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn); - } - else -#endif /* 32-bit code always takes this branch */ - { - /* Unpin this top-level shadow */ - sh2_unpin(v, smfn); - } - - /* See if that freed up a chunk of appropriate size */ - if ( chunk_is_available(d, order) ) return; - } - - /* Stage two: all shadow pages are in use in hierarchies that are - * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen - * mappings. */ - perfc_incrc(shadow2_prealloc_2); - v = current; - if ( v->domain != d ) - v = d->vcpu[0]; - /* Walk the list from the tail: recently used toplevels have been pulled - * to the head */ - list_for_each_backwards_safe(l, t, &d->arch.shadow2.toplevel_shadows) - { - pg = list_entry(l, struct page_info, list); - smfn = page_to_mfn(pg); - shadow2_unhook_mappings(v, smfn); - - /* Need to flush TLB if we've altered our own tables */ - if ( !shadow2_mode_external(d) - && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) ) - local_flush_tlb(); - - /* See if that freed up a chunk of appropriate size */ - if ( chunk_is_available(d, order) ) return; - } - - /* Nothing more we can do: all remaining shadows are of pages that - * hold Xen mappings for some vcpu. This can never happen. 
*/ - SHADOW2_PRINTK("Can't pre-allocate %i shadow pages!\n" - " shadow pages total = %u, free = %u, p2m=%u\n", - 1 << order, - d->arch.shadow2.total_pages, - d->arch.shadow2.free_pages, - d->arch.shadow2.p2m_pages); - BUG(); -} - - -/* Allocate another shadow's worth of (contiguous, aligned) pages, - * and fill in the type and backpointer fields of their page_infos. - * Never fails to allocate. */ -mfn_t shadow2_alloc(struct domain *d, - u32 shadow_type, - unsigned long backpointer) -{ - struct page_info *pg = NULL; - unsigned int order = shadow_order(shadow_type); - cpumask_t mask; - void *p; - int i; - - ASSERT(shadow2_lock_is_acquired(d)); - ASSERT(order <= SHADOW2_MAX_ORDER); - ASSERT(shadow_type != PGC_SH2_none); - perfc_incrc(shadow2_alloc); - - /* Find smallest order which can satisfy the request. */ - for ( i = order; i <= SHADOW2_MAX_ORDER; i++ ) - if ( !list_empty(&d->arch.shadow2.freelists[i]) ) - { - pg = list_entry(d->arch.shadow2.freelists[i].next, - struct page_info, list); - list_del(&pg->list); - - /* We may have to halve the chunk a number of times. */ - while ( i != order ) - { - i--; - SH2_SET_PFN_ORDER(pg, i); - list_add_tail(&pg->list, &d->arch.shadow2.freelists[i]); - pg += 1 << i; - } - d->arch.shadow2.free_pages -= 1 << order; - - /* Init page info fields and clear the pages */ - for ( i = 0; i < 1<domain_dirty_cpumask; - tlbflush_filter(mask, pg[i].tlbflush_timestamp); - if ( unlikely(!cpus_empty(mask)) ) - { - perfc_incrc(shadow2_alloc_tlbflush); - flush_tlb_mask(mask); - } - /* Now safe to clear the page for reuse */ - p = sh2_map_domain_page(page_to_mfn(pg+i)); - ASSERT(p != NULL); - clear_page(p); - sh2_unmap_domain_page(p); - perfc_incr(shadow2_alloc_count); - } - return page_to_mfn(pg); - } - - /* If we get here, we failed to allocate. This should never happen. - * It means that we didn't call shadow2_prealloc() correctly before - * we allocated. 
We can't recover by calling prealloc here, because - * we might free up higher-level pages that the caller is working on. */ - SHADOW2_PRINTK("Can't allocate %i shadow pages!\n", 1 << order); - BUG(); -} - - -/* Return some shadow pages to the pool. */ -void shadow2_free(struct domain *d, mfn_t smfn) -{ - struct page_info *pg = mfn_to_page(smfn); - u32 shadow_type; - unsigned long order; - unsigned long mask; - int i; - - ASSERT(shadow2_lock_is_acquired(d)); - perfc_incrc(shadow2_free); - - shadow_type = pg->count_info & PGC_SH2_type_mask; - ASSERT(shadow_type != PGC_SH2_none); - ASSERT(shadow_type != PGC_SH2_p2m_table); - order = shadow_order(shadow_type); - - d->arch.shadow2.free_pages += 1 << order; - - for ( i = 0; i < 1<count_info & PGC_SH2_type_mask) != PGT_none) - || (SH2_PFN_ORDER(pg-mask) != order) ) - break; - list_del(&(pg-mask)->list); - pg -= mask; - } else { - /* Merge with successor block? */ - if ( (((pg+mask)->count_info & PGC_SH2_type_mask) != PGT_none) - || (SH2_PFN_ORDER(pg+mask) != order) ) - break; - list_del(&(pg+mask)->list); - } - order++; - } - - SH2_SET_PFN_ORDER(pg, order); - list_add_tail(&pg->list, &d->arch.shadow2.freelists[order]); -} - -/* Divert some memory from the pool to be used by the p2m mapping. - * This action is irreversible: the p2m mapping only ever grows. - * That's OK because the p2m table only exists for external domains, - * and those domains can't ever turn off shadow mode. - * Also, we only ever allocate a max-order chunk, so as to preserve - * the invariant that shadow2_prealloc() always works. - * Returns 0 iff it can't get a chunk (the caller should then - * free up some pages in domheap and call set_sh2_allocation); - * returns non-zero on success. 
- */ -static int -shadow2_alloc_p2m_pages(struct domain *d) -{ - struct page_info *pg; - u32 i; - ASSERT(shadow2_lock_is_acquired(d)); - - if ( d->arch.shadow2.total_pages - < (shadow2_min_acceptable_pages(d) + (1<arch.shadow2.p2m_pages += (1<arch.shadow2.total_pages -= (1<arch.shadow2.p2m_freelist); - } - return 1; -} - -// Returns 0 if no memory is available... -mfn_t -shadow2_alloc_p2m_page(struct domain *d) -{ - struct list_head *entry; - mfn_t mfn; - void *p; - - if ( list_empty(&d->arch.shadow2.p2m_freelist) && - !shadow2_alloc_p2m_pages(d) ) - return _mfn(0); - entry = d->arch.shadow2.p2m_freelist.next; - list_del(entry); - list_add_tail(entry, &d->arch.shadow2.p2m_inuse); - mfn = page_to_mfn(list_entry(entry, struct page_info, list)); - sh2_get_ref(mfn, 0); - p = sh2_map_domain_page(mfn); - clear_page(p); - sh2_unmap_domain_page(p); - - return mfn; -} - -#if CONFIG_PAGING_LEVELS == 3 -static void p2m_install_entry_in_monitors(struct domain *d, - l3_pgentry_t *l3e) -/* Special case, only used for external-mode domains on PAE hosts: - * update the mapping of the p2m table. Once again, this is trivial in - * other paging modes (one top-level entry points to the top-level p2m, - * no maintenance needed), but PAE makes life difficult by needing a - * copy the eight l3es of the p2m table in eight l2h slots in the - * monitor table. This function makes fresh copies when a p2m l3e - * changes. 
*/ -{ - l2_pgentry_t *ml2e; - struct vcpu *v; - unsigned int index; - - index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t); - ASSERT(index < MACHPHYS_MBYTES>>1); - - for_each_vcpu(d, v) - { - if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) - continue; - ASSERT(shadow2_mode_external(v->domain)); - - SHADOW2_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n", - d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e)); - - if ( v == current ) /* OK to use linear map of monitor_table */ - ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START); - else - { - l3_pgentry_t *ml3e; - ml3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); - ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT); - ml2e = sh2_map_domain_page(_mfn(l3e_get_pfn(ml3e[3]))); - ml2e += l2_table_offset(RO_MPT_VIRT_START); - sh2_unmap_domain_page(ml3e); - } - ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR); - if ( v != current ) - sh2_unmap_domain_page(ml2e); - } -} -#endif - -// Find the next level's P2M entry, checking for out-of-range gfn's... -// Returns NULL on error. -// -static l1_pgentry_t * -p2m_find_entry(void *table, unsigned long *gfn_remainder, - unsigned long gfn, u32 shift, u32 max) -{ - u32 index; - - index = *gfn_remainder >> shift; - if ( index >= max ) - { - SHADOW2_DEBUG(P2M, "gfn=0x%lx out of range " - "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n", - gfn, *gfn_remainder, shift, index, max); - return NULL; - } - *gfn_remainder &= (1 << shift) - 1; - return (l1_pgentry_t *)table + index; -} - -// Walk one level of the P2M table, allocating a new table if required. -// Returns 0 on error. 
-// -static int -p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, - unsigned long *gfn_remainder, unsigned long gfn, u32 shift, - u32 max, unsigned long type) -{ - l1_pgentry_t *p2m_entry; - void *next; - - if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, - shift, max)) ) - return 0; - - if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) ) - { - mfn_t mfn = shadow2_alloc_p2m_page(d); - if ( mfn_x(mfn) == 0 ) - return 0; - *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER); - mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated; - mfn_to_page(mfn)->count_info = 1; -#if CONFIG_PAGING_LEVELS == 3 - if (type == PGT_l2_page_table) - { - /* We have written to the p2m l3: need to sync the per-vcpu - * copies of it in the monitor tables */ - p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry); - } -#endif - /* The P2M can be shadowed: keep the shadows synced */ - if ( d->vcpu[0] ) - (void)__shadow2_validate_guest_entry(d->vcpu[0], *table_mfn, - p2m_entry, sizeof *p2m_entry); - } - *table_mfn = _mfn(l1e_get_pfn(*p2m_entry)); - next = sh2_map_domain_page(*table_mfn); - sh2_unmap_domain_page(*table); - *table = next; - - return 1; -} - -// Returns 0 on error (out of memory) -int -shadow2_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn) -{ - // XXX -- this might be able to be faster iff current->domain == d - mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table); - void *table = sh2_map_domain_page(table_mfn); - unsigned long gfn_remainder = gfn; - l1_pgentry_t *p2m_entry; - -#if CONFIG_PAGING_LEVELS >= 4 - if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, - L4_PAGETABLE_SHIFT - PAGE_SHIFT, - L4_PAGETABLE_ENTRIES, PGT_l3_page_table) ) - return 0; -#endif -#if CONFIG_PAGING_LEVELS >= 3 - // When using PAE Xen, we only allow 33 bits of pseudo-physical - // address in translated guests (i.e. 8 GBytes). 
This restriction - // comes from wanting to map the P2M table into the 16MB RO_MPT hole - // in Xen's address space for translated PV guests. - // - if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, - L3_PAGETABLE_SHIFT - PAGE_SHIFT, - (CONFIG_PAGING_LEVELS == 3 - ? 8 - : L3_PAGETABLE_ENTRIES), - PGT_l2_page_table) ) - return 0; -#endif - if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, - L2_PAGETABLE_SHIFT - PAGE_SHIFT, - L2_PAGETABLE_ENTRIES, PGT_l1_page_table) ) - return 0; - - p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, - 0, L1_PAGETABLE_ENTRIES); - ASSERT(p2m_entry); - if ( valid_mfn(mfn) ) - *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER); - else - *p2m_entry = l1e_empty(); - - /* The P2M can be shadowed: keep the shadows synced */ - (void) __shadow2_validate_guest_entry(d->vcpu[0], table_mfn, - p2m_entry, sizeof *p2m_entry); - - sh2_unmap_domain_page(table); - - return 1; -} - -// Allocate a new p2m table for a domain. -// -// The structure of the p2m table is that of a pagetable for xen (i.e. it is -// controlled by CONFIG_PAGING_LEVELS). 
-// -// Returns 0 if p2m table could not be initialized -// -static int -shadow2_alloc_p2m_table(struct domain *d) -{ - mfn_t p2m_top; - struct list_head *entry; - unsigned int page_count = 0; - - SHADOW2_PRINTK("allocating p2m table\n"); - ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0); - - p2m_top = shadow2_alloc_p2m_page(d); - mfn_to_page(p2m_top)->count_info = 1; - mfn_to_page(p2m_top)->u.inuse.type_info = -#if CONFIG_PAGING_LEVELS == 4 - PGT_l4_page_table -#elif CONFIG_PAGING_LEVELS == 3 - PGT_l3_page_table -#elif CONFIG_PAGING_LEVELS == 2 - PGT_l2_page_table -#endif - | 1 | PGT_validated; - - if ( mfn_x(p2m_top) == 0 ) - return 0; - - d->arch.phys_table = pagetable_from_mfn(p2m_top); - - SHADOW2_PRINTK("populating p2m table\n"); - - for ( entry = d->page_list.next; - entry != &d->page_list; - entry = entry->next ) - { - struct page_info *page = list_entry(entry, struct page_info, list); - mfn_t mfn = page_to_mfn(page); - unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn)); - page_count++; - if ( -#ifdef __x86_64__ - (gfn != 0x5555555555555555L) -#else - (gfn != 0x55555555L) -#endif - && gfn != INVALID_M2P_ENTRY - && !shadow2_set_p2m_entry(d, gfn, mfn) ) - { - SHADOW2_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH2_PRI_mfn "\n", - gfn, mfn_x(mfn)); - return 0; - } - } - - SHADOW2_PRINTK("p2m table initialised (%u pages)\n", page_count); - return 1; -} - -mfn_t -sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn) -/* Read another domain's p2m entries */ -{ - mfn_t mfn; - unsigned long addr = gpfn << PAGE_SHIFT; - l2_pgentry_t *l2e; - l1_pgentry_t *l1e; - - ASSERT(shadow2_mode_translate(d)); - mfn = pagetable_get_mfn(d->arch.phys_table); - - -#if CONFIG_PAGING_LEVELS > 2 - if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) - /* This pfn is higher than the p2m map can hold */ - return _mfn(INVALID_MFN); -#endif - - -#if CONFIG_PAGING_LEVELS >= 4 - { - l4_pgentry_t *l4e = sh2_map_domain_page(mfn); - l4e += 
l4_table_offset(addr); - if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 ) - { - sh2_unmap_domain_page(l4e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l4e_get_pfn(*l4e)); - sh2_unmap_domain_page(l4e); - } -#endif -#if CONFIG_PAGING_LEVELS >= 3 - { - l3_pgentry_t *l3e = sh2_map_domain_page(mfn); - l3e += l3_table_offset(addr); - if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 ) - { - sh2_unmap_domain_page(l3e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l3e_get_pfn(*l3e)); - sh2_unmap_domain_page(l3e); - } -#endif - - l2e = sh2_map_domain_page(mfn); - l2e += l2_table_offset(addr); - if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) - { - sh2_unmap_domain_page(l2e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l2e_get_pfn(*l2e)); - sh2_unmap_domain_page(l2e); - - l1e = sh2_map_domain_page(mfn); - l1e += l1_table_offset(addr); - if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) - { - sh2_unmap_domain_page(l1e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l1e_get_pfn(*l1e)); - sh2_unmap_domain_page(l1e); - - return mfn; -} - -unsigned long -shadow2_gfn_to_mfn_foreign(unsigned long gpfn) -{ - return mfn_x(sh2_gfn_to_mfn_foreign(current->domain, gpfn)); -} - - -static void shadow2_p2m_teardown(struct domain *d) -/* Return all the p2m pages to Xen. - * We know we don't have any extra mappings to these pages */ -{ - struct list_head *entry, *n; - struct page_info *pg; - - d->arch.phys_table = pagetable_null(); - - list_for_each_safe(entry, n, &d->arch.shadow2.p2m_inuse) - { - pg = list_entry(entry, struct page_info, list); - list_del(entry); - /* Should have just the one ref we gave it in alloc_p2m_page() */ - if ( (pg->count_info & PGC_SH2_count_mask) != 1 ) - { - SHADOW2_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n", - pg->count_info, pg->u.inuse.type_info); - } - ASSERT(page_get_owner(pg) == d); - /* Free should not decrement domain's total allocation, since - * these pages were allocated without an owner. 
*/ - page_set_owner(pg, NULL); - free_domheap_pages(pg, 0); - d->arch.shadow2.p2m_pages--; - perfc_decr(shadow2_alloc_count); - } - list_for_each_safe(entry, n, &d->arch.shadow2.p2m_freelist) - { - list_del(entry); - pg = list_entry(entry, struct page_info, list); - ASSERT(page_get_owner(pg) == d); - /* Free should not decrement domain's total allocation. */ - page_set_owner(pg, NULL); - free_domheap_pages(pg, 0); - d->arch.shadow2.p2m_pages--; - perfc_decr(shadow2_alloc_count); - } - ASSERT(d->arch.shadow2.p2m_pages == 0); -} - -/* Set the pool of shadow pages to the required number of pages. - * Input will be rounded up to at least shadow2_min_acceptable_pages(), - * plus space for the p2m table. - * Returns 0 for success, non-zero for failure. */ -static unsigned int set_sh2_allocation(struct domain *d, - unsigned int pages, - int *preempted) -{ - struct page_info *pg; - unsigned int lower_bound; - int j; - - ASSERT(shadow2_lock_is_acquired(d)); - - /* Don't allocate less than the minimum acceptable, plus one page per - * megabyte of RAM (for the p2m table) */ - lower_bound = shadow2_min_acceptable_pages(d) + (d->tot_pages / 256); - if ( pages > 0 && pages < lower_bound ) - pages = lower_bound; - /* Round up to largest block size */ - pages = (pages + ((1<arch.shadow2.total_pages, pages); - - while ( d->arch.shadow2.total_pages != pages ) - { - if ( d->arch.shadow2.total_pages < pages ) - { - /* Need to allocate more memory from domheap */ - pg = alloc_domheap_pages(NULL, SHADOW2_MAX_ORDER, 0); - if ( pg == NULL ) - { - SHADOW2_PRINTK("failed to allocate shadow pages.\n"); - return -ENOMEM; - } - d->arch.shadow2.free_pages += 1<arch.shadow2.total_pages += 1<list, - &d->arch.shadow2.freelists[SHADOW2_MAX_ORDER]); - } - else if ( d->arch.shadow2.total_pages > pages ) - { - /* Need to return memory to domheap */ - shadow2_prealloc(d, SHADOW2_MAX_ORDER); - ASSERT(!list_empty(&d->arch.shadow2.freelists[SHADOW2_MAX_ORDER])); - pg = 
list_entry(d->arch.shadow2.freelists[SHADOW2_MAX_ORDER].next, - struct page_info, list); - list_del(&pg->list); - d->arch.shadow2.free_pages -= 1<arch.shadow2.total_pages -= 1<domain_id, - d->arch.shadow2.total_pages, - shadow2_get_allocation(d)); - shadow2_unlock(d); - return rv; -} - -/**************************************************************************/ -/* Hash table for storing the guest->shadow mappings */ - -/* Hash function that takes a gfn or mfn, plus another byte of type info */ -typedef u32 key_t; -static inline key_t sh2_hash(unsigned long n, u8 t) -{ - unsigned char *p = (unsigned char *)&n; - key_t k = t; - int i; - for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k; - return k; -} - -#if SHADOW2_AUDIT & (SHADOW2_AUDIT_HASH|SHADOW2_AUDIT_HASH_FULL) - -/* Before we get to the mechanism, define a pair of audit functions - * that sanity-check the contents of the hash table. */ -static void sh2_hash_audit_bucket(struct domain *d, int bucket) -/* Audit one bucket of the hash table */ -{ - struct shadow2_hash_entry *e, *x; - struct page_info *pg; - - if ( !(SHADOW2_AUDIT_ENABLE) ) - return; - - e = &d->arch.shadow2.hash_table[bucket]; - if ( e->t == 0 ) return; /* Bucket is empty */ - while ( e ) - { - /* Empty link? */ - BUG_ON( e->t == 0 ); - /* Bogus type? */ - BUG_ON( e->t > (PGC_SH2_max_shadow >> PGC_SH2_type_shift) ); - /* Wrong bucket? */ - BUG_ON( sh2_hash(e->n, e->t) % SHADOW2_HASH_BUCKETS != bucket ); - /* Duplicate entry? */ - for ( x = e->next; x; x = x->next ) - BUG_ON( x->n == e->n && x->t == e->t ); - /* Bogus MFN? */ - BUG_ON( !valid_mfn(e->smfn) ); - pg = mfn_to_page(e->smfn); - /* Not a shadow? */ - BUG_ON( page_get_owner(pg) != 0 ); - /* Wrong kind of shadow? */ - BUG_ON( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift - != e->t ); - /* Bad backlink? 
*/ - BUG_ON( pg->u.inuse.type_info != e->n ); - if ( e->t != (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift) - && e->t != (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift) - && e->t != (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) ) - { - /* Bad shadow flags on guest page? */ - BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow2_flags & (1<t)) ); - } - /* That entry was OK; on we go */ - e = e->next; - } -} - -#else -#define sh2_hash_audit_bucket(_d, _b) -#endif /* Hashtable bucket audit */ - - -#if SHADOW2_AUDIT & SHADOW2_AUDIT_HASH_FULL - -static void sh2_hash_audit(struct domain *d) -/* Full audit: audit every bucket in the table */ -{ - int i; - - if ( !(SHADOW2_AUDIT_ENABLE) ) - return; - - for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ ) - { - sh2_hash_audit_bucket(d, i); - } -} - -#else -#define sh2_hash_audit(_d) -#endif /* Hashtable bucket audit */ - -/* Memory management interface for bucket allocation. - * These ought to come out of shadow memory, but at least on 32-bit - * machines we are forced to allocate them from xenheap so that we can - * address them. */ -static struct shadow2_hash_entry *sh2_alloc_hash_entry(struct domain *d) -{ - struct shadow2_hash_entry *extra, *x; - int i; - - /* We need to allocate a new node. Ensure the free list is not empty. - * Allocate new entries in units the same size as the original table. */ - if ( unlikely(d->arch.shadow2.hash_freelist == NULL) ) - { - size_t sz = sizeof(void *) + (SHADOW2_HASH_BUCKETS * sizeof(*x)); - extra = xmalloc_bytes(sz); - - if ( extra == NULL ) - { - /* No memory left! */ - SHADOW2_ERROR("xmalloc() failed when allocating hash buckets.\n"); - domain_crash_synchronous(); - } - memset(extra, 0, sz); - - /* Record the allocation block so it can be correctly freed later. */ - *((struct shadow2_hash_entry **)&extra[SHADOW2_HASH_BUCKETS]) = - d->arch.shadow2.hash_allocations; - d->arch.shadow2.hash_allocations = &extra[0]; - - /* Thread a free chain through the newly-allocated nodes. 
*/ - for ( i = 0; i < (SHADOW2_HASH_BUCKETS - 1); i++ ) - extra[i].next = &extra[i+1]; - extra[i].next = NULL; - - /* Add the new nodes to the free list. */ - d->arch.shadow2.hash_freelist = &extra[0]; - } - - /* Allocate a new node from the free list. */ - x = d->arch.shadow2.hash_freelist; - d->arch.shadow2.hash_freelist = x->next; - return x; -} - -static void sh2_free_hash_entry(struct domain *d, struct shadow2_hash_entry *e) -{ - /* Mark the bucket as empty and return it to the free list */ - e->t = 0; - e->next = d->arch.shadow2.hash_freelist; - d->arch.shadow2.hash_freelist = e; -} - - -/* Allocate and initialise the table itself. - * Returns 0 for success, 1 for error. */ -static int shadow2_hash_alloc(struct domain *d) -{ - struct shadow2_hash_entry *table; - - ASSERT(shadow2_lock_is_acquired(d)); - ASSERT(!d->arch.shadow2.hash_table); - - table = xmalloc_array(struct shadow2_hash_entry, SHADOW2_HASH_BUCKETS); - if ( !table ) return 1; - memset(table, 0, - SHADOW2_HASH_BUCKETS * sizeof (struct shadow2_hash_entry)); - d->arch.shadow2.hash_table = table; - return 0; -} - -/* Tear down the hash table and return all memory to Xen. - * This function does not care whether the table is populated. */ -static void shadow2_hash_teardown(struct domain *d) -{ - struct shadow2_hash_entry *a, *n; - - ASSERT(shadow2_lock_is_acquired(d)); - ASSERT(d->arch.shadow2.hash_table); - - /* Return the table itself */ - xfree(d->arch.shadow2.hash_table); - d->arch.shadow2.hash_table = NULL; - - /* Return any extra allocations */ - a = d->arch.shadow2.hash_allocations; - while ( a ) - { - /* We stored a linked-list pointer at the end of each allocation */ - n = *((struct shadow2_hash_entry **)(&a[SHADOW2_HASH_BUCKETS])); - xfree(a); - a = n; - } - d->arch.shadow2.hash_allocations = NULL; - d->arch.shadow2.hash_freelist = NULL; -} - - -mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t) -/* Find an entry in the hash table. 
Returns the MFN of the shadow, - * or INVALID_MFN if it doesn't exist */ -{ - struct domain *d = v->domain; - struct shadow2_hash_entry *p, *x, *head; - key_t key; - - ASSERT(shadow2_lock_is_acquired(d)); - ASSERT(d->arch.shadow2.hash_table); - ASSERT(t); - - sh2_hash_audit(d); - - perfc_incrc(shadow2_hash_lookups); - key = sh2_hash(n, t); - - x = head = &d->arch.shadow2.hash_table[key % SHADOW2_HASH_BUCKETS]; - p = NULL; - - sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); - - do - { - ASSERT(x->t || ((x == head) && (x->next == NULL))); - - if ( x->n == n && x->t == t ) - { - /* Pull-to-front if 'x' isn't already the head item */ - if ( unlikely(x != head) ) - { - if ( unlikely(d->arch.shadow2.hash_walking != 0) ) - /* Can't reorder: someone is walking the hash chains */ - return x->smfn; - else - { - /* Delete 'x' from list and reinsert after head. */ - p->next = x->next; - x->next = head->next; - head->next = x; - - /* Swap 'x' contents with head contents. */ - SWAP(head->n, x->n); - SWAP(head->t, x->t); - SWAP(head->smfn, x->smfn); - } - } - else - { - perfc_incrc(shadow2_hash_lookup_head); - } - return head->smfn; - } - - p = x; - x = x->next; - } - while ( x != NULL ); - - perfc_incrc(shadow2_hash_lookup_miss); - return _mfn(INVALID_MFN); -} - -void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn) -/* Put a mapping (n,t)->smfn into the hash table */ -{ - struct domain *d = v->domain; - struct shadow2_hash_entry *x, *head; - key_t key; - - ASSERT(shadow2_lock_is_acquired(d)); - ASSERT(d->arch.shadow2.hash_table); - ASSERT(t); - - sh2_hash_audit(d); - - perfc_incrc(shadow2_hash_inserts); - key = sh2_hash(n, t); - - head = &d->arch.shadow2.hash_table[key % SHADOW2_HASH_BUCKETS]; - - sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); - - /* If the bucket is empty then insert the new page as the head item. 
*/ - if ( head->t == 0 ) - { - head->n = n; - head->t = t; - head->smfn = smfn; - ASSERT(head->next == NULL); - } - else - { - /* Insert a new entry directly after the head item. */ - x = sh2_alloc_hash_entry(d); - x->n = n; - x->t = t; - x->smfn = smfn; - x->next = head->next; - head->next = x; - } - - sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); -} - -void shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn) -/* Excise the mapping (n,t)->smfn from the hash table */ -{ - struct domain *d = v->domain; - struct shadow2_hash_entry *p, *x, *head; - key_t key; - - ASSERT(shadow2_lock_is_acquired(d)); - ASSERT(d->arch.shadow2.hash_table); - ASSERT(t); - - sh2_hash_audit(d); - - perfc_incrc(shadow2_hash_deletes); - key = sh2_hash(n, t); - - head = &d->arch.shadow2.hash_table[key % SHADOW2_HASH_BUCKETS]; - - sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); - - /* Match on head item? */ - if ( head->n == n && head->t == t ) - { - if ( (x = head->next) != NULL ) - { - /* Overwrite head with contents of following node. */ - head->n = x->n; - head->t = x->t; - head->smfn = x->smfn; - - /* Delete following node. */ - head->next = x->next; - sh2_free_hash_entry(d, x); - } - else - { - /* This bucket is now empty. Initialise the head node. */ - head->t = 0; - } - } - else - { - /* Not at the head; need to walk the chain */ - p = head; - x = head->next; - - while(1) - { - ASSERT(x); /* We can't have hit the end, since our target is - * still in the chain somehwere... */ - if ( x->n == n && x->t == t ) - { - /* Delete matching node. 
*/ - p->next = x->next; - sh2_free_hash_entry(d, x); - break; - } - p = x; - x = x->next; - } - } - - sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); -} - -typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn); - -static void hash_foreach(struct vcpu *v, - unsigned int callback_mask, - hash_callback_t callbacks[], - mfn_t callback_mfn) -/* Walk the hash table looking at the types of the entries and - * calling the appropriate callback function for each entry. - * The mask determines which shadow types we call back for, and the array - * of callbacks tells us which function to call. - * Any callback may return non-zero to let us skip the rest of the scan. - * - * WARNING: Callbacks MUST NOT add or remove hash entries unless they - * then return non-zero to terminate the scan. */ -{ - int i, done = 0; - struct domain *d = v->domain; - struct shadow2_hash_entry *x; - - /* Say we're here, to stop hash-lookups reordering the chains */ - ASSERT(shadow2_lock_is_acquired(d)); - ASSERT(d->arch.shadow2.hash_walking == 0); - d->arch.shadow2.hash_walking = 1; - - callback_mask &= ~1; /* Never attempt to call back on empty buckets */ - for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ ) - { - /* WARNING: This is not safe against changes to the hash table. - * The callback *must* return non-zero if it has inserted or - * deleted anything from the hash (lookups are OK, though). */ - for ( x = &d->arch.shadow2.hash_table[i]; x; x = x->next ) - { - if ( callback_mask & (1 << x->t) ) - { - ASSERT(x->t <= 15); - ASSERT(callbacks[x->t] != NULL); - if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 ) - break; - } - } - if ( done ) break; - } - d->arch.shadow2.hash_walking = 0; -} - - -/**************************************************************************/ -/* Destroy a shadow page: simple dispatcher to call the per-type destructor - * which will decrement refcounts appropriately and return memory to the - * free pool. 
*/ - -void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn) -{ - struct page_info *pg = mfn_to_page(smfn); - u32 t = pg->count_info & PGC_SH2_type_mask; - - - SHADOW2_PRINTK("smfn=%#lx\n", mfn_x(smfn)); - - /* Double-check, if we can, that the shadowed page belongs to this - * domain, (by following the back-pointer). */ - ASSERT(t == PGC_SH2_fl1_32_shadow || - t == PGC_SH2_fl1_pae_shadow || - t == PGC_SH2_fl1_64_shadow || - t == PGC_SH2_monitor_table || - (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info))) - == v->domain)); - - /* The down-shifts here are so that the switch statement is on nice - * small numbers that the compiler will enjoy */ - switch ( t >> PGC_SH2_type_shift ) - { -#if CONFIG_PAGING_LEVELS == 2 - case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift: - case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift: - SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 2, 2)(v, smfn); - break; - case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift: - SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 2, 2)(v, smfn); - break; -#else /* PAE or 64bit */ - case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift: - case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift: - SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 2)(v, smfn); - break; - case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift: - SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 2)(v, smfn); - break; -#endif - -#if CONFIG_PAGING_LEVELS >= 3 - case PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift: - case PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift: - SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 3)(v, smfn); - break; - case PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift: - case PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift: - SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 3)(v, smfn); - break; - case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift: - SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 3, 3)(v, smfn); - break; -#endif - -#if CONFIG_PAGING_LEVELS >= 4 - case PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift: - case PGC_SH2_fl1_64_shadow >> 
PGC_SH2_type_shift: - SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 4, 4)(v, smfn); - break; - case PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift: - SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 4, 4)(v, smfn); - break; - case PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift: - SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 4, 4)(v, smfn); - break; - case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift: - SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, 4, 4)(v, smfn); - break; -#endif - default: - SHADOW2_PRINTK("tried to destroy shadow of bad type %08lx\n", - (unsigned long)t); - BUG(); - } -} - -/**************************************************************************/ -/* Remove all writeable mappings of a guest frame from the shadow tables - * Returns non-zero if we need to flush TLBs. - * level and fault_addr desribe how we found this to be a pagetable; - * level==0 means we have some other reason for revoking write access.*/ - -int shadow2_remove_write_access(struct vcpu *v, mfn_t gmfn, - unsigned int level, - unsigned long fault_addr) -{ - /* Dispatch table for getting per-type functions */ - static hash_callback_t callbacks[16] = { - NULL, /* none */ -#if CONFIG_PAGING_LEVELS == 2 - SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* l1_32 */ - SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* fl1_32 */ -#else - SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* l1_32 */ - SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* fl1_32 */ -#endif - NULL, /* l2_32 */ -#if CONFIG_PAGING_LEVELS >= 3 - SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* l1_pae */ - SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* fl1_pae */ -#else - NULL, /* l1_pae */ - NULL, /* fl1_pae */ -#endif - NULL, /* l2_pae */ - NULL, /* l2h_pae */ - NULL, /* l3_pae */ -#if CONFIG_PAGING_LEVELS >= 4 - SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* l1_64 */ - SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* fl1_64 */ -#else - NULL, /* l1_64 */ - NULL, /* fl1_64 */ -#endif - 
NULL, /* l2_64 */ - NULL, /* l3_64 */ - NULL, /* l4_64 */ - NULL, /* p2m */ - NULL /* unused */ - }; - - static unsigned int callback_mask = - 1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift) - | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift) - | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift) - | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift) - | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift) - | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) - ; - struct page_info *pg = mfn_to_page(gmfn); - - ASSERT(shadow2_lock_is_acquired(v->domain)); - - /* Only remove writable mappings if we are doing shadow refcounts. - * In guest refcounting, we trust Xen to already be restricting - * all the writes to the guest page tables, so we do not need to - * do more. */ - if ( !shadow2_mode_refcounts(v->domain) ) - return 0; - - /* Early exit if it's already a pagetable, or otherwise not writeable */ - if ( sh2_mfn_is_a_page_table(gmfn) - || (pg->u.inuse.type_info & PGT_count_mask) == 0 ) - return 0; - - perfc_incrc(shadow2_writeable); - - /* If this isn't a "normal" writeable page, the domain is trying to - * put pagetables in special memory of some kind. We can't allow that. 
*/ - if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page ) - { - SHADOW2_ERROR("can't remove write access to mfn %lx, type_info is %" - PRtype_info "\n", - mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info); - domain_crash(v->domain); - } - -#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC - if ( v == current && level != 0 ) - { - unsigned long gfn; - /* Heuristic: there is likely to be only one writeable mapping, - * and that mapping is likely to be in the current pagetable, - * either in the guest's linear map (linux, windows) or in a - * magic slot used to map high memory regions (linux HIGHTPTE) */ - -#define GUESS(_a, _h) do { \ - if ( v->arch.shadow2.mode->guess_wrmap(v, (_a), gmfn) ) \ - perfc_incrc(shadow2_writeable_h_ ## _h); \ - if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \ - return 1; \ - } while (0) - - - /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */ - if ( v == current - && (gfn = sh2_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 ) - GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4); - - if ( v->arch.shadow2.mode->guest_levels == 2 ) - { - if ( level == 1 ) - /* 32bit non-PAE w2k3: linear map at 0xC0000000 */ - GUESS(0xC0000000UL + (fault_addr >> 10), 1); - } -#if CONFIG_PAGING_LEVELS >= 3 - else if ( v->arch.shadow2.mode->guest_levels == 3 ) - { - /* 32bit PAE w2k3: linear map at 0xC0000000 */ - switch ( level ) - { - case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break; - case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break; - } - } -#if CONFIG_PAGING_LEVELS >= 4 - else if ( v->arch.shadow2.mode->guest_levels == 4 ) - { - /* 64bit w2k3: linear map at 0x0000070000000000 */ - switch ( level ) - { - case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break; - case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break; - case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break; - } - } -#endif /* CONFIG_PAGING_LEVELS >= 4 */ -#endif /* CONFIG_PAGING_LEVELS >= 3 */ - -#undef GUESS - - } -#endif - - 
/* Brute-force search of all the shadows, by walking the hash */ - perfc_incrc(shadow2_writeable_bf); - hash_foreach(v, callback_mask, callbacks, gmfn); - - /* If that didn't catch the mapping, something is very wrong */ - if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 ) - { - SHADOW2_ERROR("can't find all writeable mappings of mfn %lx: " - "%lu left\n", mfn_x(gmfn), - (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask)); - domain_crash(v->domain); - } - - /* We killed at least one writeable mapping, so must flush TLBs. */ - return 1; -} - - - -/**************************************************************************/ -/* Remove all mappings of a guest frame from the shadow tables. - * Returns non-zero if we need to flush TLBs. */ - -int shadow2_remove_all_mappings(struct vcpu *v, mfn_t gmfn) -{ - struct page_info *page = mfn_to_page(gmfn); - int expected_count; - - /* Dispatch table for getting per-type functions */ - static hash_callback_t callbacks[16] = { - NULL, /* none */ -#if CONFIG_PAGING_LEVELS == 2 - SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* l1_32 */ - SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* fl1_32 */ -#else - SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* l1_32 */ - SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* fl1_32 */ -#endif - NULL, /* l2_32 */ -#if CONFIG_PAGING_LEVELS >= 3 - SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* l1_pae */ - SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* fl1_pae */ -#else - NULL, /* l1_pae */ - NULL, /* fl1_pae */ -#endif - NULL, /* l2_pae */ - NULL, /* l2h_pae */ - NULL, /* l3_pae */ -#if CONFIG_PAGING_LEVELS >= 4 - SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* l1_64 */ - SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* fl1_64 */ -#else - NULL, /* l1_64 */ - NULL, /* fl1_64 */ -#endif - NULL, /* l2_64 */ - NULL, /* l3_64 */ - NULL, /* l4_64 */ - NULL, /* p2m */ - NULL /* unused */ - }; - - static unsigned int callback_mask = 
- 1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift) - | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift) - | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift) - | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift) - | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift) - | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) - ; - - perfc_incrc(shadow2_mappings); - if ( (page->count_info & PGC_count_mask) == 0 ) - return 0; - - ASSERT(shadow2_lock_is_acquired(v->domain)); - - /* XXX TODO: - * Heuristics for finding the (probably) single mapping of this gmfn */ - - /* Brute-force search of all the shadows, by walking the hash */ - perfc_incrc(shadow2_mappings_bf); - hash_foreach(v, callback_mask, callbacks, gmfn); - - /* If that didn't catch the mapping, something is very wrong */ - expected_count = (page->count_info & PGC_allocated) ? 1 : 0; - if ( (page->count_info & PGC_count_mask) != expected_count ) - { - /* Don't complain if we're in HVM and there's one extra mapping: - * The qemu helper process has an untyped mapping of this dom's RAM */ - if ( !(shadow2_mode_external(v->domain) - && (page->count_info & PGC_count_mask) <= 2 - && (page->u.inuse.type_info & PGT_count_mask) == 0) ) - { - SHADOW2_ERROR("can't find all mappings of mfn %lx: " - "c=%08x t=%08lx\n", mfn_x(gmfn), - page->count_info, page->u.inuse.type_info); - } - } - - /* We killed at least one mapping, so must flush TLBs. */ - return 1; -} - - -/**************************************************************************/ -/* Remove all shadows of a guest frame from the shadow tables */ - -static int sh2_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn) -/* Follow this shadow's up-pointer, if it has one, and remove the reference - * found there. 
Returns 1 if that was the only reference to this shadow */ -{ - struct page_info *pg = mfn_to_page(smfn); - mfn_t pmfn; - void *vaddr; - int rc; - - ASSERT((pg->count_info & PGC_SH2_type_mask) > 0); - ASSERT((pg->count_info & PGC_SH2_type_mask) < PGC_SH2_max_shadow); - ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l2_32_shadow); - ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l3_pae_shadow); - ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l4_64_shadow); - - if (pg->up == 0) return 0; - pmfn = _mfn(pg->up >> PAGE_SHIFT); - ASSERT(valid_mfn(pmfn)); - vaddr = sh2_map_domain_page(pmfn); - ASSERT(vaddr); - vaddr += pg->up & (PAGE_SIZE-1); - ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn)); - - /* Is this the only reference to this shadow? */ - rc = ((pg->count_info & PGC_SH2_count_mask) == 1) ? 1 : 0; - - /* Blank the offending entry */ - switch ((pg->count_info & PGC_SH2_type_mask)) - { - case PGC_SH2_l1_32_shadow: - case PGC_SH2_l2_32_shadow: -#if CONFIG_PAGING_LEVELS == 2 - SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,2,2)(v, vaddr, pmfn); -#else - SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,2)(v, vaddr, pmfn); -#endif - break; -#if CONFIG_PAGING_LEVELS >=3 - case PGC_SH2_l1_pae_shadow: - case PGC_SH2_l2_pae_shadow: - case PGC_SH2_l2h_pae_shadow: - case PGC_SH2_l3_pae_shadow: - SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,3)(v, vaddr, pmfn); - break; -#if CONFIG_PAGING_LEVELS >= 4 - case PGC_SH2_l1_64_shadow: - case PGC_SH2_l2_64_shadow: - case PGC_SH2_l3_64_shadow: - case PGC_SH2_l4_64_shadow: - SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,4,4)(v, vaddr, pmfn); - break; -#endif -#endif - default: BUG(); /* Some wierd unknown shadow type */ - } - - sh2_unmap_domain_page(vaddr); - if ( rc ) - perfc_incrc(shadow2_up_pointer); - else - perfc_incrc(shadow2_unshadow_bf); - - return rc; -} - -void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all) -/* Remove the shadows of this guest page. 
- * If all != 0, find all shadows, if necessary by walking the tables. - * Otherwise, just try the (much faster) heuristics, which will remove - * at most one reference to each shadow of the page. */ -{ - struct page_info *pg; - mfn_t smfn; - u32 sh_flags; - unsigned char t; - - /* Dispatch table for getting per-type functions: each level must - * be called with the function to remove a lower-level shadow. */ - static hash_callback_t callbacks[16] = { - NULL, /* none */ - NULL, /* l1_32 */ - NULL, /* fl1_32 */ -#if CONFIG_PAGING_LEVELS == 2 - SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,2,2), /* l2_32 */ -#else - SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,2), /* l2_32 */ -#endif - NULL, /* l1_pae */ - NULL, /* fl1_pae */ -#if CONFIG_PAGING_LEVELS >= 3 - SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2_pae */ - SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2h_pae */ - SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,3,3), /* l3_pae */ -#else - NULL, /* l2_pae */ - NULL, /* l2h_pae */ - NULL, /* l3_pae */ -#endif - NULL, /* l1_64 */ - NULL, /* fl1_64 */ -#if CONFIG_PAGING_LEVELS >= 4 - SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,4,4), /* l2_64 */ - SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,4,4), /* l3_64 */ - SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow,4,4), /* l4_64 */ -#else - NULL, /* l2_64 */ - NULL, /* l3_64 */ - NULL, /* l4_64 */ -#endif - NULL, /* p2m */ - NULL /* unused */ - }; - - /* Another lookup table, for choosing which mask to use */ - static unsigned int masks[16] = { - 0, /* none */ - 1 << (PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift), /* l1_32 */ - 0, /* fl1_32 */ - 0, /* l2_32 */ - ((1 << (PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift)) - | (1 << (PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift))), /* l1_pae */ - 0, /* fl1_pae */ - 1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2_pae */ - 1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2h_pae */ - 0, /* l3_pae */ - 1 << (PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift), /* l1_64 */ - 0, 
/* fl1_64 */ - 1 << (PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift), /* l2_64 */ - 1 << (PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift), /* l3_64 */ - 0, /* l4_64 */ - 0, /* p2m */ - 0 /* unused */ - }; - - ASSERT(shadow2_lock_is_acquired(v->domain)); - - pg = mfn_to_page(gmfn); - - /* Bale out now if the page is not shadowed */ - if ( (pg->count_info & PGC_page_table) == 0 ) - return; - - SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx\n", - v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); - - /* Search for this shadow in all appropriate shadows */ - perfc_incrc(shadow2_unshadow); - sh_flags = pg->shadow2_flags; - - /* Lower-level shadows need to be excised from upper-level shadows. - * This call to hash_foreach() looks dangerous but is in fact OK: each - * call will remove at most one shadow, and terminate immediately when - * it does remove it, so we never walk the hash after doing a deletion. */ -#define DO_UNSHADOW(_type) do { \ - t = (_type) >> PGC_SH2_type_shift; \ - smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t); \ - if ( !sh2_remove_shadow_via_pointer(v, smfn) && all ) \ - hash_foreach(v, masks[t], callbacks, smfn); \ -} while (0) - - /* Top-level shadows need to be unpinned */ -#define DO_UNPIN(_type) do { \ - t = (_type) >> PGC_SH2_type_shift; \ - smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t); \ - if ( mfn_to_page(smfn)->count_info & PGC_SH2_pinned ) \ - sh2_unpin(v, smfn); \ - if ( (_type) == PGC_SH2_l3_pae_shadow ) \ - SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn); \ -} while (0) - - if ( sh_flags & SH2F_L1_32 ) DO_UNSHADOW(PGC_SH2_l1_32_shadow); - if ( sh_flags & SH2F_L2_32 ) DO_UNPIN(PGC_SH2_l2_32_shadow); -#if CONFIG_PAGING_LEVELS >= 3 - if ( sh_flags & SH2F_L1_PAE ) DO_UNSHADOW(PGC_SH2_l1_pae_shadow); - if ( sh_flags & SH2F_L2_PAE ) DO_UNSHADOW(PGC_SH2_l2_pae_shadow); - if ( sh_flags & SH2F_L2H_PAE ) DO_UNSHADOW(PGC_SH2_l2h_pae_shadow); - if ( sh_flags & SH2F_L3_PAE ) DO_UNPIN(PGC_SH2_l3_pae_shadow); -#if CONFIG_PAGING_LEVELS >= 4 - if ( sh_flags 
& SH2F_L1_64 ) DO_UNSHADOW(PGC_SH2_l1_64_shadow); - if ( sh_flags & SH2F_L2_64 ) DO_UNSHADOW(PGC_SH2_l2_64_shadow); - if ( sh_flags & SH2F_L3_64 ) DO_UNSHADOW(PGC_SH2_l3_64_shadow); - if ( sh_flags & SH2F_L4_64 ) DO_UNPIN(PGC_SH2_l4_64_shadow); -#endif -#endif - -#undef DO_UNSHADOW -#undef DO_UNPIN - - -#if CONFIG_PAGING_LEVELS > 2 - /* We may have caused some PAE l3 entries to change: need to - * fix up the copies of them in various places */ - if ( sh_flags & (SH2F_L2_PAE|SH2F_L2H_PAE) ) - sh2_pae_recopy(v->domain); -#endif - - /* If that didn't catch the shadows, something is wrong */ - if ( all && (pg->count_info & PGC_page_table) ) - { - SHADOW2_ERROR("can't find all shadows of mfn %05lx (shadow2_flags=%08x)\n", - mfn_x(gmfn), pg->shadow2_flags); - domain_crash(v->domain); - } -} - -void -shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn) -/* Even harsher: this is a HVM page that we thing is no longer a pagetable. - * Unshadow it, and recursively unshadow pages that reference it. */ -{ - shadow2_remove_all_shadows(v, gmfn); - /* XXX TODO: - * Rework this hashtable walker to return a linked-list of all - * the shadows it modified, then do breadth-first recursion - * to find the way up to higher-level tables and unshadow them too. - * - * The current code (just tearing down each page's shadows as we - * detect that it is not a pagetable) is correct, but very slow. - * It means extra emulated writes and slows down removal of mappings. 
*/ -} - -/**************************************************************************/ - -void sh2_update_paging_modes(struct vcpu *v) -{ - struct domain *d = v->domain; - struct shadow2_paging_mode *old_mode = v->arch.shadow2.mode; - mfn_t old_guest_table; - - ASSERT(shadow2_lock_is_acquired(d)); - - // Valid transitions handled by this function: - // - For PV guests: - // - after a shadow mode has been changed - // - For HVM guests: - // - after a shadow mode has been changed - // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE - // - - // Avoid determining the current shadow2 mode for uninitialized CPUs, as - // we can not yet determine whether it is an HVM or PV domain. - // - if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) ) - { - printk("%s: postponing determination of shadow2 mode\n", __func__); - return; - } - - // First, tear down any old shadow tables held by this vcpu. - // - shadow2_detach_old_tables(v); - - if ( !hvm_guest(v) ) - { - /// - /// PV guest - /// -#if CONFIG_PAGING_LEVELS == 4 - if ( pv_32bit_guest(v) ) - v->arch.shadow2.mode = &SHADOW2_INTERNAL_NAME(sh2_paging_mode,4,3); - else - v->arch.shadow2.mode = &SHADOW2_INTERNAL_NAME(sh2_paging_mode,4,4); -#elif CONFIG_PAGING_LEVELS == 3 - v->arch.shadow2.mode = &SHADOW2_INTERNAL_NAME(sh2_paging_mode,3,3); -#elif CONFIG_PAGING_LEVELS == 2 - v->arch.shadow2.mode = &SHADOW2_INTERNAL_NAME(sh2_paging_mode,2,2); -#else -#error unexpected paging mode -#endif - } - else - { - /// - /// HVM guest - /// - ASSERT(shadow2_mode_translate(d)); - ASSERT(shadow2_mode_external(d)); - - v->arch.shadow2.hvm_paging_enabled = !!hvm_paging_enabled(v); - if ( !v->arch.shadow2.hvm_paging_enabled ) - { - - /* Set v->arch.guest_table to use the p2m map, and choose - * the appropriate shadow mode */ - old_guest_table = pagetable_get_mfn(v->arch.guest_table); -#if CONFIG_PAGING_LEVELS == 2 - v->arch.guest_table = - pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table)); - v->arch.shadow2.mode = 
&SHADOW2_INTERNAL_NAME(sh2_paging_mode,2,2); -#elif CONFIG_PAGING_LEVELS == 3 - v->arch.guest_table = - pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table)); - v->arch.shadow2.mode = &SHADOW2_INTERNAL_NAME(sh2_paging_mode,3,3); -#else /* CONFIG_PAGING_LEVELS == 4 */ - { - l4_pgentry_t *l4e; - /* Use the start of the first l3 table as a PAE l3 */ - ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0); - l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); - ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT); - v->arch.guest_table = - pagetable_from_pfn(l4e_get_pfn(l4e[0])); - sh2_unmap_domain_page(l4e); - } - v->arch.shadow2.mode = &SHADOW2_INTERNAL_NAME(sh2_paging_mode,3,3); -#endif - /* Fix up refcounts on guest_table */ - get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d); - if ( mfn_x(old_guest_table) != 0 ) - put_page(mfn_to_page(old_guest_table)); - } - else - { -#ifdef __x86_64__ - if ( hvm_long_mode_enabled(v) ) - { - // long mode guest... - v->arch.shadow2.mode = - &SHADOW2_INTERNAL_NAME(sh2_paging_mode, 4, 4); - } - else -#endif - if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE ) - { -#if CONFIG_PAGING_LEVELS >= 3 - // 32-bit PAE mode guest... - v->arch.shadow2.mode = - &SHADOW2_INTERNAL_NAME(sh2_paging_mode, 3, 3); -#else - SHADOW2_ERROR("PAE not supported in 32-bit Xen\n"); - domain_crash(d); - return; -#endif - } - else - { - // 32-bit 2 level guest... 
-#if CONFIG_PAGING_LEVELS >= 3 - v->arch.shadow2.mode = - &SHADOW2_INTERNAL_NAME(sh2_paging_mode, 3, 2); -#else - v->arch.shadow2.mode = - &SHADOW2_INTERNAL_NAME(sh2_paging_mode, 2, 2); -#endif - } - } - - if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) - { - mfn_t mmfn = shadow2_make_monitor_table(v); - v->arch.monitor_table = pagetable_from_mfn(mmfn); - v->arch.monitor_vtable = sh2_map_domain_page(mmfn); - } - - if ( v->arch.shadow2.mode != old_mode ) - { - SHADOW2_PRINTK("new paging mode: d=%u v=%u g=%u s=%u " - "(was g=%u s=%u)\n", - d->domain_id, v->vcpu_id, - v->arch.shadow2.mode->guest_levels, - v->arch.shadow2.mode->shadow_levels, - old_mode ? old_mode->guest_levels : 0, - old_mode ? old_mode->shadow_levels : 0); - if ( old_mode && - (v->arch.shadow2.mode->shadow_levels != - old_mode->shadow_levels) ) - { - /* Need to make a new monitor table for the new mode */ - mfn_t new_mfn, old_mfn; - - if ( v != current ) - { - SHADOW2_ERROR("Some third party (d=%u v=%u) is changing " - "this HVM vcpu's (d=%u v=%u) paging mode!\n", - current->domain->domain_id, current->vcpu_id, - v->domain->domain_id, v->vcpu_id); - domain_crash(v->domain); - return; - } - - sh2_unmap_domain_page(v->arch.monitor_vtable); - old_mfn = pagetable_get_mfn(v->arch.monitor_table); - v->arch.monitor_table = pagetable_null(); - new_mfn = v->arch.shadow2.mode->make_monitor_table(v); - v->arch.monitor_table = pagetable_from_mfn(new_mfn); - v->arch.monitor_vtable = sh2_map_domain_page(new_mfn); - SHADOW2_PRINTK("new monitor table %"SH2_PRI_mfn "\n", - mfn_x(new_mfn)); - - /* Don't be running on the old monitor table when we - * pull it down! Switch CR3, and warn the HVM code that - * its host cr3 has changed. */ - make_cr3(v, mfn_x(new_mfn)); - write_ptbase(v); - hvm_update_host_cr3(v); - old_mode->destroy_monitor_table(v, old_mfn); - } - } - - // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE. 
- // These are HARD: think about the case where two CPU's have - // different values for CR4.PSE and CR4.PGE at the same time. - // This *does* happen, at least for CR4.PGE... - } - - v->arch.shadow2.mode->update_cr3(v); -} - -/**************************************************************************/ -/* Turning on and off shadow2 features */ - -static void sh2_new_mode(struct domain *d, u32 new_mode) -/* Inform all the vcpus that the shadow mode has been changed */ -{ - struct vcpu *v; - - ASSERT(shadow2_lock_is_acquired(d)); - ASSERT(d != current->domain); - d->arch.shadow2.mode = new_mode; - if ( new_mode & SHM2_translate ) - shadow2_audit_p2m(d); - for_each_vcpu(d, v) - sh2_update_paging_modes(v); -} - -static int shadow2_enable(struct domain *d, u32 mode) -/* Turn on "permanent" shadow features: external, translate, refcount. - * Can only be called once on a domain, and these features cannot be - * disabled. - * Returns 0 for success, -errno for failure. */ -{ - unsigned int old_pages; - int rv = 0; - - mode |= SHM2_enable; - - domain_pause(d); - shadow2_lock(d); - - /* Sanity check the arguments */ - if ( (d == current->domain) || - shadow2_mode_enabled(d) || - ((mode & SHM2_external) && !(mode & SHM2_translate)) ) - { - rv = -EINVAL; - goto out; - } - - // XXX -- eventually would like to require that all memory be allocated - // *after* shadow2_enabled() is called... So here, we would test to make - // sure that d->page_list is empty. 
-#if 0 - spin_lock(&d->page_alloc_lock); - if ( !list_empty(&d->page_list) ) - { - spin_unlock(&d->page_alloc_lock); - rv = -EINVAL; - goto out; - } - spin_unlock(&d->page_alloc_lock); -#endif - - /* Init the shadow memory allocation if the user hasn't done so */ - old_pages = d->arch.shadow2.total_pages; - if ( old_pages == 0 ) - if ( set_sh2_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */ - { - set_sh2_allocation(d, 0, NULL); - rv = -ENOMEM; - goto out; - } - - /* Init the hash table */ - if ( shadow2_hash_alloc(d) != 0 ) - { - set_sh2_allocation(d, old_pages, NULL); - rv = -ENOMEM; - goto out; - } - - /* Init the P2M table */ - if ( mode & SHM2_translate ) - if ( !shadow2_alloc_p2m_table(d) ) - { - shadow2_hash_teardown(d); - set_sh2_allocation(d, old_pages, NULL); - shadow2_p2m_teardown(d); - rv = -ENOMEM; - goto out; - } - - /* Update the bits */ - sh2_new_mode(d, mode); - shadow2_audit_p2m(d); - out: - shadow2_unlock(d); - domain_unpause(d); - return 0; -} - -void shadow2_teardown(struct domain *d) -/* Destroy the shadow pagetables of this domain and free its shadow memory. - * Should only be called for dying domains. */ -{ - struct vcpu *v; - mfn_t mfn; - - ASSERT(test_bit(_DOMF_dying, &d->domain_flags)); - ASSERT(d != current->domain); - - if ( !shadow2_lock_is_acquired(d) ) - shadow2_lock(d); /* Keep various asserts happy */ - - if ( shadow2_mode_enabled(d) ) - { - /* Release the shadow and monitor tables held by each vcpu */ - for_each_vcpu(d, v) - { - shadow2_detach_old_tables(v); - if ( shadow2_mode_external(d) ) - { - mfn = pagetable_get_mfn(v->arch.monitor_table); - if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) ) - shadow2_destroy_monitor_table(v, mfn); - v->arch.monitor_table = pagetable_null(); - } - } - } - - if ( d->arch.shadow2.total_pages != 0 ) - { - SHADOW2_PRINTK("teardown of domain %u starts." 
- " Shadow pages total = %u, free = %u, p2m=%u\n", - d->domain_id, - d->arch.shadow2.total_pages, - d->arch.shadow2.free_pages, - d->arch.shadow2.p2m_pages); - /* Destroy all the shadows and release memory to domheap */ - set_sh2_allocation(d, 0, NULL); - /* Release the hash table back to xenheap */ - if (d->arch.shadow2.hash_table) - shadow2_hash_teardown(d); - /* Release the log-dirty bitmap of dirtied pages */ - sh2_free_log_dirty_bitmap(d); - /* Should not have any more memory held */ - SHADOW2_PRINTK("teardown done." - " Shadow pages total = %u, free = %u, p2m=%u\n", - d->arch.shadow2.total_pages, - d->arch.shadow2.free_pages, - d->arch.shadow2.p2m_pages); - ASSERT(d->arch.shadow2.total_pages == 0); - } - - /* We leave the "permanent" shadow modes enabled, but clear the - * log-dirty mode bit. We don't want any more mark_dirty() - * calls now that we've torn down the bitmap */ - d->arch.shadow2.mode &= ~SHM2_log_dirty; - - shadow2_unlock(d); -} - -void shadow2_final_teardown(struct domain *d) -/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */ -{ - - SHADOW2_PRINTK("dom %u final teardown starts." - " Shadow pages total = %u, free = %u, p2m=%u\n", - d->domain_id, - d->arch.shadow2.total_pages, - d->arch.shadow2.free_pages, - d->arch.shadow2.p2m_pages); - - /* Double-check that the domain didn't have any shadow memory. - * It is possible for a domain that never got domain_kill()ed - * to get here with its shadow allocation intact. */ - if ( d->arch.shadow2.total_pages != 0 ) - shadow2_teardown(d); - - /* It is now safe to pull down the p2m map. */ - if ( d->arch.shadow2.p2m_pages != 0 ) - shadow2_p2m_teardown(d); - - SHADOW2_PRINTK("dom %u final teardown done." 
- " Shadow pages total = %u, free = %u, p2m=%u\n", - d->domain_id, - d->arch.shadow2.total_pages, - d->arch.shadow2.free_pages, - d->arch.shadow2.p2m_pages); -} - -static int shadow2_one_bit_enable(struct domain *d, u32 mode) -/* Turn on a single shadow mode feature */ -{ - ASSERT(shadow2_lock_is_acquired(d)); - - /* Sanity check the call */ - if ( d == current->domain || (d->arch.shadow2.mode & mode) ) - { - return -EINVAL; - } - - if ( d->arch.shadow2.mode == 0 ) - { - /* Init the shadow memory allocation and the hash table */ - if ( set_sh2_allocation(d, 1, NULL) != 0 - || shadow2_hash_alloc(d) != 0 ) - { - set_sh2_allocation(d, 0, NULL); - return -ENOMEM; - } - } - - /* Update the bits */ - sh2_new_mode(d, d->arch.shadow2.mode | mode); - - return 0; -} - -static int shadow2_one_bit_disable(struct domain *d, u32 mode) -/* Turn off a single shadow mode feature */ -{ - struct vcpu *v; - ASSERT(shadow2_lock_is_acquired(d)); - - /* Sanity check the call */ - if ( d == current->domain || !(d->arch.shadow2.mode & mode) ) - { - return -EINVAL; - } - - /* Update the bits */ - sh2_new_mode(d, d->arch.shadow2.mode & ~mode); - if ( d->arch.shadow2.mode == 0 ) - { - /* Get this domain off shadows */ - SHADOW2_PRINTK("un-shadowing of domain %u starts." - " Shadow pages total = %u, free = %u, p2m=%u\n", - d->domain_id, - d->arch.shadow2.total_pages, - d->arch.shadow2.free_pages, - d->arch.shadow2.p2m_pages); - for_each_vcpu(d, v) - { - shadow2_detach_old_tables(v); -#if CONFIG_PAGING_LEVELS == 4 - if ( !(v->arch.flags & TF_kernel_mode) ) - make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user)); - else -#endif - make_cr3(v, pagetable_get_pfn(v->arch.guest_table)); - - } - - /* Pull down the memory allocation */ - if ( set_sh2_allocation(d, 0, NULL) != 0 ) - { - // XXX - How can this occur? - // Seems like a bug to return an error now that we've - // disabled the relevant shadow mode. 
- // - return -ENOMEM; - } - shadow2_hash_teardown(d); - SHADOW2_PRINTK("un-shadowing of domain %u done." - " Shadow pages total = %u, free = %u, p2m=%u\n", - d->domain_id, - d->arch.shadow2.total_pages, - d->arch.shadow2.free_pages, - d->arch.shadow2.p2m_pages); - } - - return 0; -} - -/* Enable/disable ops for the "test" and "log-dirty" modes */ -int shadow2_test_enable(struct domain *d) -{ - int ret; - - domain_pause(d); - shadow2_lock(d); - - if ( shadow2_mode_enabled(d) ) - { - SHADOW2_ERROR("Don't support enabling test mode" - "on already shadowed doms\n"); - ret = -EINVAL; - goto out; - } - - ret = shadow2_one_bit_enable(d, SHM2_enable); - out: - shadow2_unlock(d); - domain_unpause(d); - - return ret; -} - -int shadow2_test_disable(struct domain *d) -{ - int ret; - - domain_pause(d); - shadow2_lock(d); - ret = shadow2_one_bit_disable(d, SHM2_enable); - shadow2_unlock(d); - domain_unpause(d); - - return ret; -} - -static int -sh2_alloc_log_dirty_bitmap(struct domain *d) -{ - ASSERT(d->arch.shadow2.dirty_bitmap == NULL); - d->arch.shadow2.dirty_bitmap_size = - (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) & - ~(BITS_PER_LONG - 1); - d->arch.shadow2.dirty_bitmap = - xmalloc_array(unsigned long, - d->arch.shadow2.dirty_bitmap_size / BITS_PER_LONG); - if ( d->arch.shadow2.dirty_bitmap == NULL ) - { - d->arch.shadow2.dirty_bitmap_size = 0; - return -ENOMEM; - } - memset(d->arch.shadow2.dirty_bitmap, 0, d->arch.shadow2.dirty_bitmap_size/8); - - return 0; -} - -static void -sh2_free_log_dirty_bitmap(struct domain *d) -{ - d->arch.shadow2.dirty_bitmap_size = 0; - if ( d->arch.shadow2.dirty_bitmap ) - { - xfree(d->arch.shadow2.dirty_bitmap); - d->arch.shadow2.dirty_bitmap = NULL; - } -} - -static int shadow2_log_dirty_enable(struct domain *d) -{ - int ret; - - domain_pause(d); - shadow2_lock(d); - - if ( shadow2_mode_log_dirty(d) ) - { - ret = -EINVAL; - goto out; - } - - if ( shadow2_mode_enabled(d) ) - { - SHADOW2_ERROR("Don't (yet) support enabling 
log-dirty" - "on already shadowed doms\n"); - ret = -EINVAL; - goto out; - } - - ret = sh2_alloc_log_dirty_bitmap(d); - if ( ret != 0 ) - { - sh2_free_log_dirty_bitmap(d); - goto out; - } - - ret = shadow2_one_bit_enable(d, SHM2_log_dirty); - if ( ret != 0 ) - sh2_free_log_dirty_bitmap(d); - - out: - shadow2_unlock(d); - domain_unpause(d); - return ret; -} - -static int shadow2_log_dirty_disable(struct domain *d) -{ - int ret; - - domain_pause(d); - shadow2_lock(d); - ret = shadow2_one_bit_disable(d, SHM2_log_dirty); - if ( !shadow2_mode_log_dirty(d) ) - sh2_free_log_dirty_bitmap(d); - shadow2_unlock(d); - domain_unpause(d); - - return ret; -} - -/**************************************************************************/ -/* P2M map manipulations */ - -static void -sh2_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn) -{ - struct vcpu *v; - - if ( !shadow2_mode_translate(d) ) - return; - - v = current; - if ( v->domain != d ) - v = d->vcpu[0]; - - - SHADOW2_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn); - - ASSERT(mfn_x(sh2_gfn_to_mfn(d, gfn)) == mfn); - //ASSERT(sh2_mfn_to_gfn(d, mfn) == gfn); - - shadow2_remove_all_shadows_and_parents(v, _mfn(mfn)); - if ( shadow2_remove_all_mappings(v, _mfn(mfn)) ) - flush_tlb_mask(d->domain_dirty_cpumask); - shadow2_set_p2m_entry(d, gfn, _mfn(INVALID_MFN)); - set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); -} - -void -shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn, - unsigned long mfn) -{ - shadow2_lock(d); - shadow2_audit_p2m(d); - sh2_p2m_remove_page(d, gfn, mfn); - shadow2_audit_p2m(d); - shadow2_unlock(d); -} - -void -shadow2_guest_physmap_add_page(struct domain *d, unsigned long gfn, - unsigned long mfn) -{ - struct vcpu *v; - unsigned long ogfn; - mfn_t omfn; - - if ( !shadow2_mode_translate(d) ) - return; - - v = current; - if ( v->domain != d ) - v = d->vcpu[0]; - - shadow2_lock(d); - shadow2_audit_p2m(d); - - SHADOW2_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn); - - 
omfn = sh2_gfn_to_mfn(d, gfn); - if ( valid_mfn(omfn) ) - { - /* Get rid of the old mapping, especially any shadows */ - shadow2_remove_all_shadows_and_parents(v, omfn); - if ( shadow2_remove_all_mappings(v, omfn) ) - flush_tlb_mask(d->domain_dirty_cpumask); - set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); - } - - ogfn = sh2_mfn_to_gfn(d, _mfn(mfn)); - if ( -#ifdef __x86_64__ - (ogfn != 0x5555555555555555L) -#else - (ogfn != 0x55555555L) -#endif - && (ogfn != INVALID_M2P_ENTRY) - && (ogfn != gfn) ) - { - /* This machine frame is already mapped at another physical address */ - SHADOW2_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n", - mfn, ogfn, gfn); - if ( valid_mfn(omfn = sh2_gfn_to_mfn(d, ogfn)) ) - { - SHADOW2_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n", - ogfn , mfn_x(omfn)); - if ( mfn_x(omfn) == mfn ) - sh2_p2m_remove_page(d, ogfn, mfn); - } - } - - shadow2_set_p2m_entry(d, gfn, _mfn(mfn)); - set_gpfn_from_mfn(mfn, gfn); - shadow2_audit_p2m(d); - shadow2_unlock(d); -} - -/**************************************************************************/ -/* Log-dirty mode support */ - -/* Convert a shadow to log-dirty mode. */ -void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn) -{ - BUG(); -} - - -/* Read a domain's log-dirty bitmap and stats. - * If the operation is a CLEAN, clear the bitmap and stats as well. */ -static int shadow2_log_dirty_op( - struct domain *d, struct xen_domctl_shadow_op *sc) -{ - int i, rv = 0, clean = 0; - - domain_pause(d); - shadow2_lock(d); - - clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN); - - SHADOW2_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n", - (clean) ? "clean" : "peek", - d->domain_id, - d->arch.shadow2.fault_count, - d->arch.shadow2.dirty_count); - - sc->stats.fault_count = d->arch.shadow2.fault_count; - sc->stats.dirty_count = d->arch.shadow2.dirty_count; - - if ( clean ) - { - struct list_head *l, *t; - struct page_info *pg; - - /* Need to revoke write access to the domain's pages again. 
- * In future, we'll have a less heavy-handed approach to this, - * but for now, we just unshadow everything except Xen. */ - list_for_each_safe(l, t, &d->arch.shadow2.toplevel_shadows) - { - pg = list_entry(l, struct page_info, list); - shadow2_unhook_mappings(d->vcpu[0], page_to_mfn(pg)); - } - - d->arch.shadow2.fault_count = 0; - d->arch.shadow2.dirty_count = 0; - } - - if ( guest_handle_is_null(sc->dirty_bitmap) || - (d->arch.shadow2.dirty_bitmap == NULL) ) - { - rv = -EINVAL; - goto out; - } - - if ( sc->pages > d->arch.shadow2.dirty_bitmap_size ) - sc->pages = d->arch.shadow2.dirty_bitmap_size; - -#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */ - for ( i = 0; i < sc->pages; i += CHUNK ) - { - int bytes = ((((sc->pages - i) > CHUNK) - ? CHUNK - : (sc->pages - i)) + 7) / 8; - - if ( copy_to_guest_offset( - sc->dirty_bitmap, - i/(8*sizeof(unsigned long)), - d->arch.shadow2.dirty_bitmap + (i/(8*sizeof(unsigned long))), - (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) ) - { - rv = -EINVAL; - goto out; - } - - if ( clean ) - memset(d->arch.shadow2.dirty_bitmap + (i/(8*sizeof(unsigned long))), - 0, bytes); - } -#undef CHUNK - - out: - shadow2_unlock(d); - domain_unpause(d); - return 0; -} - - -/* Mark a page as dirty */ -void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn) -{ - unsigned long pfn; - - ASSERT(shadow2_lock_is_acquired(d)); - ASSERT(shadow2_mode_log_dirty(d)); - - if ( !valid_mfn(gmfn) ) - return; - - ASSERT(d->arch.shadow2.dirty_bitmap != NULL); - - /* We /really/ mean PFN here, even for non-translated guests. */ - pfn = get_gpfn_from_mfn(mfn_x(gmfn)); - - /* - * Values with the MSB set denote MFNs that aren't really part of the - * domain's pseudo-physical memory map (e.g., the shared info frame). - * Nothing to do here... - */ - if ( unlikely(!VALID_M2P(pfn)) ) - return; - - /* N.B. Can use non-atomic TAS because protected by shadow2_lock. 
*/ - if ( likely(pfn < d->arch.shadow2.dirty_bitmap_size) ) - { - if ( !__test_and_set_bit(pfn, d->arch.shadow2.dirty_bitmap) ) - { - SHADOW2_DEBUG(LOGDIRTY, - "marked mfn %" SH2_PRI_mfn " (pfn=%lx), dom %d\n", - mfn_x(gmfn), pfn, d->domain_id); - d->arch.shadow2.dirty_count++; - } - } - else - { - SHADOW2_PRINTK("mark_dirty OOR! " - "mfn=%" SH2_PRI_mfn " pfn=%lx max=%x (dom %d)\n" - "owner=%d c=%08x t=%" PRtype_info "\n", - mfn_x(gmfn), - pfn, - d->arch.shadow2.dirty_bitmap_size, - d->domain_id, - (page_get_owner(mfn_to_page(gmfn)) - ? page_get_owner(mfn_to_page(gmfn))->domain_id - : -1), - mfn_to_page(gmfn)->count_info, - mfn_to_page(gmfn)->u.inuse.type_info); - } -} - - -/**************************************************************************/ -/* Shadow-control XEN_DOMCTL dispatcher */ - -int shadow2_domctl(struct domain *d, - xen_domctl_shadow_op_t *sc, - XEN_GUEST_HANDLE(xen_domctl_t) u_domctl) -{ - int rc, preempted = 0; - - if ( unlikely(d == current->domain) ) - { - DPRINTK("Don't try to do a shadow op on yourself!\n"); - return -EINVAL; - } - - switch ( sc->op ) - { - case XEN_DOMCTL_SHADOW_OP_OFF: - if ( shadow2_mode_log_dirty(d) ) - if ( (rc = shadow2_log_dirty_disable(d)) != 0 ) - return rc; - if ( d->arch.shadow2.mode & SHM2_enable ) - if ( (rc = shadow2_test_disable(d)) != 0 ) - return rc; - return 0; - - case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST: - return shadow2_test_enable(d); - - case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY: - return shadow2_log_dirty_enable(d); - - case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE: - return shadow2_enable(d, SHM2_refcounts|SHM2_translate); - - case XEN_DOMCTL_SHADOW_OP_CLEAN: - case XEN_DOMCTL_SHADOW_OP_PEEK: - return shadow2_log_dirty_op(d, sc); - - case XEN_DOMCTL_SHADOW_OP_ENABLE: - if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY ) - return shadow2_log_dirty_enable(d); - return shadow2_enable(d, sc->mode << SHM2_shift); - - case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: - sc->mb = shadow2_get_allocation(d); - return 0; - 
- case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: - rc = shadow2_set_allocation(d, sc->mb, &preempted); - if ( preempted ) - /* Not finished. Set up to re-run the call. */ - rc = hypercall_create_continuation( - __HYPERVISOR_domctl, "h", u_domctl); - else - /* Finished. Return the new allocation */ - sc->mb = shadow2_get_allocation(d); - return rc; - - default: - SHADOW2_ERROR("Bad shadow op %u\n", sc->op); - return -EINVAL; - } -} - - -/**************************************************************************/ -/* Auditing shadow tables */ - -#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL - -void shadow2_audit_tables(struct vcpu *v) -{ - /* Dispatch table for getting per-type functions */ - static hash_callback_t callbacks[16] = { - NULL, /* none */ -#if CONFIG_PAGING_LEVELS == 2 - SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,2,2), /* l1_32 */ - SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,2,2), /* fl1_32 */ - SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,2,2), /* l2_32 */ -#else - SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,2), /* l1_32 */ - SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,2), /* fl1_32 */ - SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,2), /* l2_32 */ - SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,3), /* l1_pae */ - SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,3), /* fl1_pae */ - SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3), /* l2_pae */ - SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3), /* l2h_pae */ - SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,3,3), /* l3_pae */ -#if CONFIG_PAGING_LEVELS >= 4 - SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,4,4), /* l1_64 */ - SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,4,4), /* fl1_64 */ - SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,4,4), /* l2_64 */ - SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,4,4), /* l3_64 */ - SHADOW2_INTERNAL_NAME(sh2_audit_l4_table,4,4), /* l4_64 */ -#endif /* CONFIG_PAGING_LEVELS >= 4 */ -#endif /* CONFIG_PAGING_LEVELS > 2 */ - NULL /* All the rest */ - }; - unsigned int mask; - - if ( !(SHADOW2_AUDIT_ENABLE) ) - return; - - if ( 
SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL ) - mask = ~1; /* Audit every table in the system */ - else - { - /* Audit only the current mode's tables */ - switch ( v->arch.shadow2.mode->guest_levels ) - { - case 2: mask = (SH2F_L1_32|SH2F_FL1_32|SH2F_L2_32); break; - case 3: mask = (SH2F_L1_PAE|SH2F_FL1_PAE|SH2F_L2_PAE - |SH2F_L2H_PAE|SH2F_L3_PAE); break; - case 4: mask = (SH2F_L1_64|SH2F_FL1_64|SH2F_L2_64 - |SH2F_L3_64|SH2F_L4_64); break; - default: BUG(); - } - } - - hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN)); -} - -#endif /* Shadow audit */ - - -/**************************************************************************/ -/* Auditing p2m tables */ - -#if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M - -void shadow2_audit_p2m(struct domain *d) -{ - struct list_head *entry; - struct page_info *page; - struct domain *od; - unsigned long mfn, gfn, m2pfn, lp2mfn = 0; - mfn_t p2mfn; - unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0; - int test_linear; - - if ( !(SHADOW2_AUDIT_ENABLE) || !shadow2_mode_translate(d) ) - return; - - //SHADOW2_PRINTK("p2m audit starts\n"); - - test_linear = ( (d == current->domain) && current->arch.monitor_vtable ); - if ( test_linear ) - local_flush_tlb(); - - /* Audit part one: walk the domain's page allocation list, checking - * the m2p entries. 
*/ - for ( entry = d->page_list.next; - entry != &d->page_list; - entry = entry->next ) - { - page = list_entry(entry, struct page_info, list); - mfn = mfn_x(page_to_mfn(page)); - - // SHADOW2_PRINTK("auditing guest page, mfn=%#lx\n", mfn); - - od = page_get_owner(page); - - if ( od != d ) - { - SHADOW2_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n", - mfn, od, (od?od->domain_id:-1), d, d->domain_id); - continue; - } - - gfn = get_gpfn_from_mfn(mfn); - if ( gfn == INVALID_M2P_ENTRY ) - { - orphans_i++; - //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n", - // mfn); - continue; - } - - if ( gfn == 0x55555555 ) - { - orphans_d++; - //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n", - // mfn); - continue; - } - - p2mfn = sh2_gfn_to_mfn_foreign(d, gfn); - if ( mfn_x(p2mfn) != mfn ) - { - mpbad++; - SHADOW2_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx" - " (-> gfn %#lx)\n", - mfn, gfn, mfn_x(p2mfn), - (mfn_valid(p2mfn) - ? get_gpfn_from_mfn(mfn_x(p2mfn)) - : -1u)); - /* This m2p entry is stale: the domain has another frame in - * this physical slot. No great disaster, but for neatness, - * blow away the m2p entry. */ - set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); - } - - if ( test_linear ) - { - lp2mfn = get_mfn_from_gpfn(gfn); - if ( lp2mfn != mfn_x(p2mfn) ) - { - SHADOW2_PRINTK("linear mismatch gfn %#lx -> mfn %#lx " - "(!= mfn %#lx)\n", gfn, lp2mfn, p2mfn); - } - } - - // SHADOW2_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n", - // mfn, gfn, p2mfn, lp2mfn); - } - - /* Audit part two: walk the domain's p2m table, checking the entries. 
*/ - if ( pagetable_get_pfn(d->arch.phys_table) != 0 ) - { - l2_pgentry_t *l2e; - l1_pgentry_t *l1e; - int i1, i2; - -#if CONFIG_PAGING_LEVELS == 4 - l4_pgentry_t *l4e; - l3_pgentry_t *l3e; - int i3, i4; - l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); -#elif CONFIG_PAGING_LEVELS == 3 - l3_pgentry_t *l3e; - int i3; - l3e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); -#else /* CONFIG_PAGING_LEVELS == 2 */ - l2e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); -#endif - - gfn = 0; -#if CONFIG_PAGING_LEVELS >= 3 -#if CONFIG_PAGING_LEVELS >= 4 - for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ ) - { - if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) ) - { - gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT); - continue; - } - l3e = sh2_map_domain_page(_mfn(l4e_get_pfn(l4e[i4]))); -#endif /* now at levels 3 or 4... */ - for ( i3 = 0; - i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8); - i3++ ) - { - if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) ) - { - gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT); - continue; - } - l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[i3]))); -#endif /* all levels... 
*/ - for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ ) - { - if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) ) - { - gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT); - continue; - } - l1e = sh2_map_domain_page(_mfn(l2e_get_pfn(l2e[i2]))); - - for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ ) - { - if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) ) - continue; - mfn = l1e_get_pfn(l1e[i1]); - ASSERT(valid_mfn(_mfn(mfn))); - m2pfn = get_gpfn_from_mfn(mfn); - if ( m2pfn != gfn ) - { - pmbad++; - SHADOW2_PRINTK("mismatch: gfn %#lx -> mfn %#lx" - " -> gfn %#lx\n", gfn, mfn, m2pfn); - BUG(); - } - } - sh2_unmap_domain_page(l1e); - } -#if CONFIG_PAGING_LEVELS >= 3 - sh2_unmap_domain_page(l2e); - } -#if CONFIG_PAGING_LEVELS >= 4 - sh2_unmap_domain_page(l3e); - } -#endif -#endif - -#if CONFIG_PAGING_LEVELS == 4 - sh2_unmap_domain_page(l4e); -#elif CONFIG_PAGING_LEVELS == 3 - sh2_unmap_domain_page(l3e); -#else /* CONFIG_PAGING_LEVELS == 2 */ - sh2_unmap_domain_page(l2e); -#endif - - } - - //SHADOW2_PRINTK("p2m audit complete\n"); - //if ( orphans_i | orphans_d | mpbad | pmbad ) - // SHADOW2_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n", - // orphans_i + orphans_d, orphans_i, orphans_d, - if ( mpbad | pmbad ) - SHADOW2_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n", - pmbad, mpbad); -} - -#endif /* p2m audit */ - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/arch/x86/shadow2.c b/xen/arch/x86/shadow2.c deleted file mode 100644 index 943091b716..0000000000 --- a/xen/arch/x86/shadow2.c +++ /dev/null @@ -1,4492 +0,0 @@ -/****************************************************************************** - * arch/x86/shadow2.c - * - * Simple, mostly-synchronous shadow page tables. - * Parts of this code are Copyright (c) 2006 by XenSource Inc. 
- * Parts of this code are Copyright (c) 2006 by Michael A Fetterman - * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -// DESIGN QUESTIONS: -// Why use subshadows for PAE guests? -// - reduces pressure in the hash table -// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3) -// - would need to find space in the page_info to store 7 more bits of -// backpointer -// - independent shadows of 32 byte chunks makes it non-obvious how to quickly -// figure out when to demote the guest page from l3 status -// -// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space. -// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address -// space for both PV and HVM guests. -// - -#define SHADOW2 1 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* The first cut: an absolutely synchronous, trap-and-emulate version, - * supporting only HVM guests (and so only "external" shadow mode). - * - * THINGS TO DO LATER: - * - * FIX GVA_TO_GPA - * The current interface returns an unsigned long, which is not big enough - * to hold a physical address in PAE. Should return a gfn instead. 
- * - * TEARDOWN HEURISTICS - * Also: have a heuristic for when to destroy a previous paging-mode's - * shadows. When a guest is done with its start-of-day 32-bit tables - * and reuses the memory we want to drop those shadows. Start with - * shadows in a page in two modes as a hint, but beware of clever tricks - * like reusing a pagetable for both PAE and 64-bit during boot... - * - * PAE LINEAR MAPS - * Rework shadow_get_l*e() to have the option of using map_domain_page() - * instead of linear maps. Add appropriate unmap_l*e calls in the users. - * Then we can test the speed difference made by linear maps. If the - * map_domain_page() version is OK on PAE, we could maybe allow a lightweight - * l3-and-l2h-only shadow mode for PAE PV guests that would allow them - * to share l2h pages again. - * - * PAE L3 COPYING - * In this code, we copy all 32 bytes of a PAE L3 every time we change an - * entry in it, and every time we change CR3. We copy it for the linear - * mappings (ugh! PAE linear mappings) and we copy it to the low-memory - * buffer so it fits in CR3. Maybe we can avoid some of this recopying - * by using the shadow directly in some places. - * Also, for SMP, need to actually respond to seeing shadow2.pae_flip_pending. - * - * GUEST_WALK_TABLES TLB FLUSH COALESCE - * guest_walk_tables can do up to three remote TLB flushes as it walks to - * the first l1 of a new pagetable. Should coalesce the flushes to the end, - * and if we do flush, re-do the walk. If anything has changed, then - * pause all the other vcpus and do the walk *again*. - * - * WP DISABLED - * Consider how to implement having the WP bit of CR0 set to 0. - * Since we need to be able to cause write faults to pagetables, this might - * end up looking like not having the (guest) pagetables present at all in - * HVM guests... - * - * PSE disabled / PSE36 - * We don't support any modes other than PSE enabled, PSE36 disabled. 
- * Neither of those would be hard to change, but we'd need to be able to - * deal with shadows made in one mode and used in another. - */ - -#define FETCH_TYPE_PREFETCH 1 -#define FETCH_TYPE_DEMAND 2 -#define FETCH_TYPE_WRITE 4 -typedef enum { - ft_prefetch = FETCH_TYPE_PREFETCH, - ft_demand_read = FETCH_TYPE_DEMAND, - ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE, -} fetch_type_t; - -#ifdef DEBUG_TRACE_DUMP -static char *fetch_type_names[] = { - [ft_prefetch] "prefetch", - [ft_demand_read] "demand read", - [ft_demand_write] "demand write", -}; -#endif - -/* XXX forward declarations */ -#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) -static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res); -#endif -static inline void sh2_update_linear_entries(struct vcpu *v); - -/**************************************************************************/ -/* Hash table mapping from guest pagetables to shadows - * - * Normal case: maps the mfn of a guest page to the mfn of its shadow page. - * FL1's: maps the *gfn* of the start of a superpage to the mfn of a - * shadow L1 which maps its "splinters". - * PAE CR3s: maps the 32-byte aligned, 32-bit CR3 value to the mfn of the - * PAE L3 info page for that CR3 value. 
- */ - -static inline mfn_t -get_fl1_shadow_status(struct vcpu *v, gfn_t gfn) -/* Look for FL1 shadows in the hash table */ -{ - mfn_t smfn = shadow2_hash_lookup(v, gfn_x(gfn), - PGC_SH2_fl1_shadow >> PGC_SH2_type_shift); - - if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) ) - { - struct page_info *page = mfn_to_page(smfn); - if ( !(page->count_info & PGC_SH2_log_dirty) ) - shadow2_convert_to_log_dirty(v, smfn); - } - - return smfn; -} - -static inline mfn_t -get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type) -/* Look for shadows in the hash table */ -{ - mfn_t smfn = shadow2_hash_lookup(v, mfn_x(gmfn), - shadow_type >> PGC_SH2_type_shift); - perfc_incrc(shadow2_get_shadow_status); - - if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) ) - { - struct page_info *page = mfn_to_page(smfn); - if ( !(page->count_info & PGC_SH2_log_dirty) ) - shadow2_convert_to_log_dirty(v, smfn); - } - - return smfn; -} - -static inline void -set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn) -/* Put an FL1 shadow into the hash table */ -{ - SHADOW2_PRINTK("gfn=%"SH2_PRI_gfn", type=%08x, smfn=%05lx\n", - gfn_x(gfn), PGC_SH2_fl1_shadow, mfn_x(smfn)); - - if ( unlikely(shadow2_mode_log_dirty(v->domain)) ) - // mark this shadow as a log dirty shadow... - set_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info); - else - clear_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info); - - shadow2_hash_insert(v, gfn_x(gfn), - PGC_SH2_fl1_shadow >> PGC_SH2_type_shift, smfn); -} - -static inline void -set_shadow2_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn) -/* Put a shadow into the hash table */ -{ - struct domain *d = v->domain; - int res; - - SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n", - d->domain_id, v->vcpu_id, mfn_x(gmfn), - shadow_type, mfn_x(smfn)); - - if ( unlikely(shadow2_mode_log_dirty(d)) ) - // mark this shadow as a log dirty shadow... 
- set_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info); - else - clear_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info); - - res = get_page(mfn_to_page(gmfn), d); - ASSERT(res == 1); - - shadow2_hash_insert(v, mfn_x(gmfn), shadow_type >> PGC_SH2_type_shift, - smfn); -} - -static inline void -delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn) -/* Remove a shadow from the hash table */ -{ - SHADOW2_PRINTK("gfn=%"SH2_PRI_gfn", type=%08x, smfn=%05lx\n", - gfn_x(gfn), PGC_SH2_fl1_shadow, mfn_x(smfn)); - - shadow2_hash_delete(v, gfn_x(gfn), - PGC_SH2_fl1_shadow >> PGC_SH2_type_shift, smfn); -} - -static inline void -delete_shadow2_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn) -/* Remove a shadow from the hash table */ -{ - SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n", - v->domain->domain_id, v->vcpu_id, - mfn_x(gmfn), shadow_type, mfn_x(smfn)); - shadow2_hash_delete(v, mfn_x(gmfn), - shadow_type >> PGC_SH2_type_shift, smfn); - put_page(mfn_to_page(gmfn)); -} - -/**************************************************************************/ -/* CPU feature support querying */ - -static inline int -guest_supports_superpages(struct vcpu *v) -{ - /* The _PAGE_PSE bit must be honoured in HVM guests, whenever - * CR4.PSE is set or the guest is in PAE or long mode */ - return (hvm_guest(v) && (GUEST_PAGING_LEVELS != 2 - || (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE))); -} - -static inline int -guest_supports_nx(struct vcpu *v) -{ - if ( !hvm_guest(v) ) - return cpu_has_nx; - - // XXX - fix this! - return 1; -} - - -/**************************************************************************/ -/* Functions for walking the guest page tables */ - - -/* Walk the guest pagetables, filling the walk_t with what we see. - * Takes an uninitialised walk_t. The caller must call unmap_walk() - * on the walk_t before discarding it or calling guest_walk_tables again. 
- * If "guest_op" is non-zero, we are serving a genuine guest memory access, - * and must (a) be under the shadow2 lock, and (b) remove write access - * from any gueat PT pages we see, as we will be using their contents to - * perform shadow updates. - * Returns 0 for success or non-zero if the guest pagetables are malformed. - * N.B. Finding a not-present entry does not cause a non-zero return code. */ -static inline int -guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op) -{ - ASSERT(!guest_op || shadow2_lock_is_acquired(v->domain)); - - perfc_incrc(shadow2_guest_walk); - memset(gw, 0, sizeof(*gw)); - gw->va = va; - -#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ - /* Get l4e from the top level table */ - gw->l4mfn = pagetable_get_mfn(v->arch.guest_table); - gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va); - /* Walk down to the l3e */ - if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0; - gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e)); - if ( !valid_mfn(gw->l3mfn) ) return 1; - /* This mfn is a pagetable: make sure the guest can't write to it. */ - if ( guest_op && shadow2_remove_write_access(v, gw->l3mfn, 3, va) != 0 ) - flush_tlb_mask(v->domain->domain_dirty_cpumask); - gw->l3e = ((guest_l3e_t *)sh2_map_domain_page(gw->l3mfn)) - + guest_l3_table_offset(va); -#else /* PAE only... */ - /* Get l3e from the top level table */ - gw->l3mfn = pagetable_get_mfn(v->arch.guest_table); - gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va); -#endif /* PAE or 64... */ - /* Walk down to the l2e */ - if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0; - gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e)); - if ( !valid_mfn(gw->l2mfn) ) return 1; - /* This mfn is a pagetable: make sure the guest can't write to it. 
*/ - if ( guest_op && shadow2_remove_write_access(v, gw->l2mfn, 2, va) != 0 ) - flush_tlb_mask(v->domain->domain_dirty_cpumask); - gw->l2e = ((guest_l2e_t *)sh2_map_domain_page(gw->l2mfn)) - + guest_l2_table_offset(va); -#else /* 32-bit only... */ - /* Get l2e from the top level table */ - gw->l2mfn = pagetable_get_mfn(v->arch.guest_table); - gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va); -#endif /* All levels... */ - - if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0; - if ( guest_supports_superpages(v) && - (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) ) - { - /* Special case: this guest VA is in a PSE superpage, so there's - * no guest l1e. We make one up so that the propagation code - * can generate a shadow l1 table. Start with the gfn of the - * first 4k-page of the superpage. */ - gfn_t start = guest_l2e_get_gfn(*gw->l2e); - /* Grant full access in the l1e, since all the guest entry's - * access controls are enforced in the shadow l2e. This lets - * us reflect l2 changes later without touching the l1s. */ - int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| - _PAGE_ACCESSED|_PAGE_DIRTY); - /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7 - * of the level 1 */ - if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) ) - flags |= _PAGE_PAT; - /* Increment the pfn by the right number of 4k pages. - * The ~0x1 is to mask out the PAT bit mentioned above. */ - start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va)); - gw->eff_l1e = guest_l1e_from_gfn(start, flags); - gw->l1e = NULL; - gw->l1mfn = _mfn(INVALID_MFN); - } - else - { - /* Not a superpage: carry on and find the l1e. */ - gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e)); - if ( !valid_mfn(gw->l1mfn) ) return 1; - /* This mfn is a pagetable: make sure the guest can't write to it. 
*/ - if ( guest_op - && shadow2_remove_write_access(v, gw->l1mfn, 1, va) != 0 ) - flush_tlb_mask(v->domain->domain_dirty_cpumask); - gw->l1e = ((guest_l1e_t *)sh2_map_domain_page(gw->l1mfn)) - + guest_l1_table_offset(va); - gw->eff_l1e = *gw->l1e; - } - - return 0; -} - -/* Given a walk_t, translate the gw->va into the guest's notion of the - * corresponding frame number. */ -static inline gfn_t -guest_walk_to_gfn(walk_t *gw) -{ - if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) ) - return _gfn(INVALID_GFN); - return guest_l1e_get_gfn(gw->eff_l1e); -} - -/* Given a walk_t, translate the gw->va into the guest's notion of the - * corresponding physical address. */ -static inline paddr_t -guest_walk_to_gpa(walk_t *gw) -{ - if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) ) - return 0; - return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK); -} - - -/* Unmap (and reinitialise) a guest walk. - * Call this to dispose of any walk filled in by guest_walk_tables() */ -static void unmap_walk(struct vcpu *v, walk_t *gw) -{ -#if GUEST_PAGING_LEVELS >= 3 -#if GUEST_PAGING_LEVELS >= 4 - if ( gw->l3e != NULL ) sh2_unmap_domain_page(gw->l3e); -#endif - if ( gw->l2e != NULL ) sh2_unmap_domain_page(gw->l2e); -#endif - if ( gw->l1e != NULL ) sh2_unmap_domain_page(gw->l1e); -#ifdef DEBUG - memset(gw, 0, sizeof(*gw)); -#endif -} - - -/* Pretty-print the contents of a guest-walk */ -static inline void print_gw(walk_t *gw) -{ - SHADOW2_PRINTK("GUEST WALK TO %#lx:\n", gw->va); -#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ - SHADOW2_PRINTK(" l4mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l4mfn)); - SHADOW2_PRINTK(" l4e=%p\n", gw->l4e); - if ( gw->l4e ) - SHADOW2_PRINTK(" *l4e=%" SH2_PRI_gpte "\n", gw->l4e->l4); -#endif /* PAE or 64... 
*/ - SHADOW2_PRINTK(" l3mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l3mfn)); - SHADOW2_PRINTK(" l3e=%p\n", gw->l3e); - if ( gw->l3e ) - SHADOW2_PRINTK(" *l3e=%" SH2_PRI_gpte "\n", gw->l3e->l3); -#endif /* All levels... */ - SHADOW2_PRINTK(" l2mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l2mfn)); - SHADOW2_PRINTK(" l2e=%p\n", gw->l2e); - if ( gw->l2e ) - SHADOW2_PRINTK(" *l2e=%" SH2_PRI_gpte "\n", gw->l2e->l2); - SHADOW2_PRINTK(" l1mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l1mfn)); - SHADOW2_PRINTK(" l1e=%p\n", gw->l1e); - if ( gw->l1e ) - SHADOW2_PRINTK(" *l1e=%" SH2_PRI_gpte "\n", gw->l1e->l1); - SHADOW2_PRINTK(" eff_l1e=%" SH2_PRI_gpte "\n", gw->eff_l1e.l1); -} - - -#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES -/* Lightweight audit: pass all the shadows associated with this guest walk - * through the audit mechanisms */ -static void sh2_audit_gw(struct vcpu *v, walk_t *gw) -{ - mfn_t smfn; - - if ( !(SHADOW2_AUDIT_ENABLE) ) - return; - -#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ - if ( valid_mfn(gw->l4mfn) - && valid_mfn((smfn = get_shadow_status(v, gw->l4mfn, - PGC_SH2_l4_shadow))) ) - (void) sh2_audit_l4_table(v, smfn, _mfn(INVALID_MFN)); -#endif /* PAE or 64... */ - if ( valid_mfn(gw->l3mfn) - && valid_mfn((smfn = get_shadow_status(v, gw->l3mfn, - PGC_SH2_l3_shadow))) ) - (void) sh2_audit_l3_table(v, smfn, _mfn(INVALID_MFN)); -#endif /* All levels... 
*/ - if ( valid_mfn(gw->l2mfn) ) - { - if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn, - PGC_SH2_l2_shadow))) ) - (void) sh2_audit_l2_table(v, smfn, _mfn(INVALID_MFN)); -#if GUEST_PAGING_LEVELS == 3 - if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn, - PGC_SH2_l2h_shadow))) ) - (void) sh2_audit_l2_table(v, smfn, _mfn(INVALID_MFN)); -#endif - } - if ( valid_mfn(gw->l1mfn) - && valid_mfn((smfn = get_shadow_status(v, gw->l1mfn, - PGC_SH2_l1_shadow))) ) - (void) sh2_audit_l1_table(v, smfn, _mfn(INVALID_MFN)); - else if ( gw->l2e - && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) - && valid_mfn( - (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) ) - (void) sh2_audit_fl1_table(v, smfn, _mfn(INVALID_MFN)); -} - -#else -#define sh2_audit_gw(_v, _gw) do {} while(0) -#endif /* audit code */ - - - -/**************************************************************************/ -/* Function to write to the guest tables, for propagating accessed and - * dirty bits from the shadow to the guest. - * Takes a guest mfn, a pointer to the guest entry, the level of pagetable, - * and an operation type. The guest entry is always passed as an l1e: - * since we only ever write flags, that's OK. - * Returns the new flag bits of the guest entry. 
*/ - -static u32 guest_set_ad_bits(struct vcpu *v, - mfn_t gmfn, - guest_l1e_t *ep, - unsigned int level, - fetch_type_t ft) -{ - u32 flags, shflags, bit; - struct page_info *pg; - int res = 0; - - ASSERT(valid_mfn(gmfn) - && (sh2_mfn_is_a_page_table(gmfn) - || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) - == 0))); - ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1))); - ASSERT(level <= GUEST_PAGING_LEVELS); - ASSERT(ft == ft_demand_read || ft == ft_demand_write); - ASSERT(shadow2_lock_is_acquired(v->domain)); - - flags = guest_l1e_get_flags(*ep); - - /* PAE l3s do not have A and D bits */ - if ( unlikely(GUEST_PAGING_LEVELS == 3 && level == 3) ) - return flags; - - /* Need the D bit as well for writes, in l1es and 32bit/PAE PSE l2es. */ - if ( ft == ft_demand_write - && (level == 1 || - (level == 2 && GUEST_PAGING_LEVELS < 4 - && (flags & _PAGE_PSE) && guest_supports_superpages(v))) ) - { - if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) - == (_PAGE_DIRTY | _PAGE_ACCESSED) ) - return flags; /* Guest already has A and D bits set */ - flags |= _PAGE_DIRTY | _PAGE_ACCESSED; - perfc_incrc(shadow2_ad_update); - } - else - { - if ( flags & _PAGE_ACCESSED ) - return flags; /* Guest already has A bit set */ - flags |= _PAGE_ACCESSED; - perfc_incrc(shadow2_a_update); - } - - /* Set the bit(s) */ - sh2_mark_dirty(v->domain, gmfn); - SHADOW2_DEBUG(A_AND_D, "gfn = %"SH2_PRI_gfn", " - "old flags = %#x, new flags = %#x\n", - guest_l1e_get_gfn(*ep), guest_l1e_get_flags(*ep), flags); - *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags); - - /* May need to propagate this change forward to other kinds of shadow */ - pg = mfn_to_page(gmfn); - if ( !sh2_mfn_is_a_page_table(gmfn) ) - { - /* This guest pagetable is not yet shadowed at all. */ - // MAF: I think this assert is busted... If this gmfn has not yet - // been promoted, then it seems perfectly reasonable for there to be - // outstanding type refs to it... - /* TJD: No. 
If the gmfn has not been promoted, we must at least - * have recognised that it is a pagetable, and pulled write access. - * The type count should only be non-zero if it is actually a page - * table. The test above was incorrect, though, so I've fixed it. */ - ASSERT((pg->u.inuse.type_info & PGT_count_mask) == 0); - return flags; - } - - shflags = pg->shadow2_flags & SH2F_page_type_mask; - while ( shflags ) - { - bit = find_first_set_bit(shflags); - ASSERT(shflags & (1u << bit)); - shflags &= ~(1u << bit); - if ( !(pg->shadow2_flags & (1u << bit)) ) - continue; - switch ( bit ) - { - case PGC_SH2_type_to_index(PGC_SH2_l1_shadow): - if (level != 1) - res |= sh2_map_and_validate_gl1e(v, gmfn, ep, sizeof (*ep)); - break; - case PGC_SH2_type_to_index(PGC_SH2_l2_shadow): - if (level != 2) - res |= sh2_map_and_validate_gl2e(v, gmfn, ep, sizeof (*ep)); - break; -#if GUEST_PAGING_LEVELS == 3 /* PAE only */ - case PGC_SH2_type_to_index(PGC_SH2_l2h_shadow): - if (level != 2) - res |= sh2_map_and_validate_gl2he(v, gmfn, ep, sizeof (*ep)); - break; -#endif -#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ - case PGC_SH2_type_to_index(PGC_SH2_l3_shadow): - if (level != 3) - res |= sh2_map_and_validate_gl3e(v, gmfn, ep, sizeof (*ep)); - break; -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ - case PGC_SH2_type_to_index(PGC_SH2_l4_shadow): - if (level != 4) - res |= sh2_map_and_validate_gl4e(v, gmfn, ep, sizeof (*ep)); - break; -#endif -#endif - default: - SHADOW2_ERROR("mfn %"SH2_PRI_mfn" is shadowed in multiple " - "modes: A&D bits may be out of sync (flags=%#x).\n", - mfn_x(gmfn), pg->shadow2_flags); - /* XXX Shadows in other modes will not be updated, so will - * have their A and D bits out of sync. 
*/ - } - } - - /* We should never need to flush the TLB or recopy PAE entries */ - ASSERT( res == 0 || res == SHADOW2_SET_CHANGED ); - return flags; -} - -/**************************************************************************/ -/* Functions to compute the correct index into a shadow page, given an - * index into the guest page (as returned by guest_get_index()). - * This is trivial when the shadow and guest use the same sized PTEs, but - * gets more interesting when those sizes are mismatched (e.g. 32-bit guest, - * PAE- or 64-bit shadows). - * - * These functions also increment the shadow mfn, when necessary. When PTE - * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1 - * page. In this case, we allocate 2 contiguous pages for the shadow L1, and - * use simple pointer arithmetic on a pointer to the guest L1e to figure out - * which shadow page we really want. Similarly, when PTE sizes are - * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest - * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address - * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address - * space.) - * - * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes - * of shadow (to store both the shadow, and the info that would normally be - * stored in page_info fields). This arrangement allows the shadow and the - * "page_info" fields to always be stored in the same page (in fact, in - * the same cache line), avoiding an extra call to map_domain_page(). 
- */ - -static inline u32 -guest_index(void *ptr) -{ - return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t); -} - -static inline u32 -shadow_l1_index(mfn_t *smfn, u32 guest_index) -{ -#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2) - *smfn = _mfn(mfn_x(*smfn) + - (guest_index / SHADOW_L1_PAGETABLE_ENTRIES)); - return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES); -#else - return guest_index; -#endif -} - -static inline u32 -shadow_l2_index(mfn_t *smfn, u32 guest_index) -{ -#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2) - // Because we use 2 shadow l2 entries for each guest entry, the number of - // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2 - // - *smfn = _mfn(mfn_x(*smfn) + - (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2))); - - // We multiple by two to get the index of the first of the two entries - // used to shadow the specified guest entry. - return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2; -#else - return guest_index; -#endif -} - -#if GUEST_PAGING_LEVELS >= 3 - -static inline u32 -shadow_l3_index(mfn_t *smfn, u32 guest_index) -{ -#if GUEST_PAGING_LEVELS == 3 - u32 group_id; - - // Because we use twice the space in L3 shadows as was consumed in guest - // L3s, the number of guest entries per shadow page is - // SHADOW_L2_PAGETABLE_ENTRIES/2. (Note this is *not* - // SHADOW_L3_PAGETABLE_ENTRIES, which in this case is 4...) - // - *smfn = _mfn(mfn_x(*smfn) + - (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2))); - - // We store PAE L3 shadows in groups of 4, alternating shadows and - // pae_l3_bookkeeping structs. So the effective shadow index is - // the the group_id * 8 + the offset within the group. 
- // - guest_index %= (SHADOW_L2_PAGETABLE_ENTRIES / 2); - group_id = guest_index / 4; - return (group_id * 8) + (guest_index % 4); -#else - return guest_index; -#endif -} - -#endif // GUEST_PAGING_LEVELS >= 3 - -#if GUEST_PAGING_LEVELS >= 4 - -static inline u32 -shadow_l4_index(mfn_t *smfn, u32 guest_index) -{ - return guest_index; -} - -#endif // GUEST_PAGING_LEVELS >= 4 - - -/**************************************************************************/ -/* Functions which compute shadow entries from their corresponding guest - * entries. - * - * These are the "heart" of the shadow code. - * - * There are two sets of these: those that are called on demand faults (read - * faults and write faults), and those that are essentially called to - * "prefetch" (or propagate) entries from the guest into the shadow. The read - * fault and write fault are handled as two separate cases for L1 entries (due - * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together - * into the respective demand_fault functions. - */ - -#define CHECK(_cond) \ -do { \ - if (unlikely(!(_cond))) \ - { \ - printk("%s %s %d ASSERTION (%s) FAILED\n", \ - __func__, __FILE__, __LINE__, #_cond); \ - return -1; \ - } \ -} while (0); - -// The function below tries to capture all of the flag manipulation for the -// demand and propagate functions into one place. -// -static always_inline u32 -sh2_propagate_flags(struct vcpu *v, mfn_t target_mfn, - u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn, - int mmio, int level, fetch_type_t ft) -{ - struct domain *d = v->domain; - u32 pass_thru_flags; - u32 sflags; - - // XXX -- might want to think about PAT support for HVM guests... - -#ifndef NDEBUG - // MMIO can only occur from L1e's - // - if ( mmio ) - CHECK(level == 1); - - // We should always have a pointer to the guest entry if it's a non-PSE - // non-MMIO demand access. 
- if ( ft & FETCH_TYPE_DEMAND ) - CHECK(guest_entry_ptr || level == 1); -#endif - - // A not-present guest entry has a special signature in the shadow table, - // so that we do not have to consult the guest tables multiple times... - // - if ( unlikely(!(gflags & _PAGE_PRESENT)) ) - return _PAGE_SHADOW_GUEST_NOT_PRESENT; - - // Must have a valid target_mfn, unless this is mmio, or unless this is a - // prefetch. In the case of a prefetch, an invalid mfn means that we can - // not usefully shadow anything, and so we return early. - // - if ( !valid_mfn(target_mfn) ) - { - CHECK((ft == ft_prefetch) || mmio); - if ( !mmio ) - return 0; - } - - // PAE does not allow NX, RW, USER, ACCESSED, or DIRTY bits in its L3e's... - // - if ( (SHADOW_PAGING_LEVELS == 3) && (level == 3) ) - pass_thru_flags = _PAGE_PRESENT; - else - { - pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER | - _PAGE_RW | _PAGE_PRESENT); - if ( guest_supports_nx(v) ) - pass_thru_flags |= _PAGE_NX_BIT; - } - - // PAE guests can not put NX, RW, USER, ACCESSED, or DIRTY bits into their - // L3e's; they are all implied. So we emulate them here. - // - if ( (GUEST_PAGING_LEVELS == 3) && (level == 3) ) - gflags = pass_thru_flags; - - // Propagate bits from the guest to the shadow. - // Some of these may be overwritten, below. - // Since we know the guest's PRESENT bit is set, we also set the shadow's - // SHADOW_PRESENT bit. - // - sflags = (gflags & pass_thru_flags) | _PAGE_SHADOW_PRESENT; - - // Copy the guest's RW bit into the SHADOW_RW bit. - // - if ( gflags & _PAGE_RW ) - sflags |= _PAGE_SHADOW_RW; - - // Set the A&D bits for higher level shadows. - // Higher level entries do not, strictly speaking, have dirty bits, but - // since we use shadow linear tables, each of these entries may, at some - // point in time, also serve as a shadow L1 entry. - // By setting both the A&D bits in each of these, we eliminate the burden - // on the hardware to update these bits on initial accesses. 
- // - if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) ) - sflags |= _PAGE_ACCESSED | _PAGE_DIRTY; - - - // Set the A and D bits in the guest entry, if we need to. - if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) ) - gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft); - - // If the A or D bit has not yet been set in the guest, then we must - // prevent the corresponding kind of access. - // - if ( unlikely(!((GUEST_PAGING_LEVELS == 3) && (level == 3)) && - !(gflags & _PAGE_ACCESSED)) ) - sflags &= ~_PAGE_PRESENT; - - /* D bits exist in l1es, and 32bit/PAE PSE l2es, but not 64bit PSE l2es */ - if ( unlikely( ((level == 1) - || ((level == 2) && (GUEST_PAGING_LEVELS < 4) - && guest_supports_superpages(v) && - (gflags & _PAGE_PSE))) - && !(gflags & _PAGE_DIRTY)) ) - sflags &= ~_PAGE_RW; - - // MMIO caching - // - // MMIO mappings are marked as not present, but we set the SHADOW_MMIO bit - // to cache the fact that this entry is in MMIO space. - // - if ( (level == 1) && mmio ) - { - sflags &= ~(_PAGE_PRESENT); - sflags |= _PAGE_SHADOW_MMIO; - } - else - { - // shadow2_mode_log_dirty support - // - // Only allow the guest write access to a page a) on a demand fault, - // or b) if the page is already marked as dirty. - // - if ( unlikely((level == 1) && - !(ft & FETCH_TYPE_WRITE) && - shadow2_mode_log_dirty(d) && - !sh2_mfn_is_dirty(d, target_mfn)) ) - { - sflags &= ~_PAGE_RW; - } - - // protect guest page tables - // - if ( unlikely((level == 1) && - sh2_mfn_is_a_page_table(target_mfn)) ) - { - if ( shadow2_mode_trap_reads(d) ) - { - // if we are trapping both reads & writes, then mark this page - // as not present... - // - sflags &= ~_PAGE_PRESENT; - } - else - { - // otherwise, just prevent any writes... 
- // - sflags &= ~_PAGE_RW; - } - } - } - - return sflags; -} - -#undef CHECK - -#if GUEST_PAGING_LEVELS >= 4 -static void -l4e_propagate_from_guest(struct vcpu *v, - guest_l4e_t *gl4e, - mfn_t gl4mfn, - mfn_t sl3mfn, - shadow_l4e_t *sl4p, - fetch_type_t ft) -{ - u32 gflags = guest_l4e_get_flags(*gl4e); - u32 sflags = sh2_propagate_flags(v, sl3mfn, gflags, (guest_l1e_t *) gl4e, - gl4mfn, 0, 4, ft); - - *sl4p = shadow_l4e_from_mfn(sl3mfn, sflags); - - SHADOW2_DEBUG(PROPAGATE, - "%s gl4e=%" SH2_PRI_gpte " sl4e=%" SH2_PRI_pte "\n", - fetch_type_names[ft], gl4e->l4, sl4p->l4); - ASSERT(sflags != -1); -} -#endif // GUEST_PAGING_LEVELS >= 4 - -#if GUEST_PAGING_LEVELS >= 3 -static void -l3e_propagate_from_guest(struct vcpu *v, - guest_l3e_t *gl3e, - mfn_t gl3mfn, - mfn_t sl2mfn, - shadow_l3e_t *sl3p, - fetch_type_t ft) -{ - u32 gflags = guest_l3e_get_flags(*gl3e); - u32 sflags = sh2_propagate_flags(v, sl2mfn, gflags, (guest_l1e_t *) gl3e, - gl3mfn, 0, 3, ft); - - *sl3p = shadow_l3e_from_mfn(sl2mfn, sflags); - - SHADOW2_DEBUG(PROPAGATE, - "%s gl3e=%" SH2_PRI_gpte " sl3e=%" SH2_PRI_pte "\n", - fetch_type_names[ft], gl3e->l3, sl3p->l3); - ASSERT(sflags != -1); -} -#endif // GUEST_PAGING_LEVELS >= 3 - -static void -l2e_propagate_from_guest(struct vcpu *v, - guest_l2e_t *gl2e, - mfn_t gl2mfn, - mfn_t sl1mfn, - shadow_l2e_t *sl2p, - fetch_type_t ft) -{ - u32 gflags = guest_l2e_get_flags(*gl2e); - u32 sflags = sh2_propagate_flags(v, sl1mfn, gflags, (guest_l1e_t *) gl2e, - gl2mfn, 0, 2, ft); - - *sl2p = shadow_l2e_from_mfn(sl1mfn, sflags); - - SHADOW2_DEBUG(PROPAGATE, - "%s gl2e=%" SH2_PRI_gpte " sl2e=%" SH2_PRI_pte "\n", - fetch_type_names[ft], gl2e->l2, sl2p->l2); - ASSERT(sflags != -1); -} - -static inline int -l1e_read_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p, - int mmio) -/* returns 1 if emulation is required, and 0 otherwise */ -{ - struct domain *d = v->domain; - u32 gflags = guest_l1e_get_flags(gw->eff_l1e); - u32 sflags = sh2_propagate_flags(v, 
gmfn, gflags, gw->l1e, gw->l1mfn, - mmio, 1, ft_demand_read); - - if ( shadow2_mode_trap_reads(d) && !mmio && sh2_mfn_is_a_page_table(gmfn) ) - { - // emulation required! - *sl1p = shadow_l1e_empty(); - return 1; - } - - *sl1p = shadow_l1e_from_mfn(gmfn, sflags); - - SHADOW2_DEBUG(PROPAGATE, - "va=%p eff_gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n", - (void *)gw->va, gw->eff_l1e.l1, sl1p->l1); - - ASSERT(sflags != -1); - return 0; -} - -static inline int -l1e_write_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p, - int mmio) -/* returns 1 if emulation is required, and 0 otherwise */ -{ - struct domain *d = v->domain; - u32 gflags = guest_l1e_get_flags(gw->eff_l1e); - u32 sflags = sh2_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn, - mmio, 1, ft_demand_write); - - sh2_mark_dirty(d, gmfn); - - if ( !mmio && sh2_mfn_is_a_page_table(gmfn) ) - { - // emulation required! - *sl1p = shadow_l1e_empty(); - return 1; - } - - *sl1p = shadow_l1e_from_mfn(gmfn, sflags); - - SHADOW2_DEBUG(PROPAGATE, - "va=%p eff_gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n", - (void *)gw->va, gw->eff_l1e.l1, sl1p->l1); - - ASSERT(sflags != -1); - return 0; -} - -static inline void -l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, shadow_l1e_t *sl1p, - int mmio) -{ - gfn_t gfn = guest_l1e_get_gfn(gl1e); - mfn_t gmfn = (mmio) ? _mfn(gfn_x(gfn)) : vcpu_gfn_to_mfn(v, gfn); - u32 gflags = guest_l1e_get_flags(gl1e); - u32 sflags = sh2_propagate_flags(v, gmfn, gflags, 0, _mfn(INVALID_MFN), - mmio, 1, ft_prefetch); - - *sl1p = shadow_l1e_from_mfn(gmfn, sflags); - - SHADOW2_DEBUG(PROPAGATE, - "gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n", - gl1e.l1, sl1p->l1); - - ASSERT(sflags != -1); -} - - -/**************************************************************************/ -/* These functions update shadow entries (and do bookkeeping on the shadow - * tables they are in). 
It is intended that they are the only - * functions which ever write (non-zero) data onto a shadow page. - * - * They return a set of flags: - * SHADOW2_SET_CHANGED -- we actually wrote a new value to the shadow. - * SHADOW2_SET_FLUSH -- the caller must cause a TLB flush. - * SHADOW2_SET_ERROR -- the input is not a valid entry (for example, if - * shadow2_get_page_from_l1e() fails). - * SHADOW2_SET_L3PAE_RECOPY -- one or more vcpu's need to have their local - * copies of their PAE L3 entries re-copied. - */ - -static inline void safe_write_entry(void *dst, void *src) -/* Copy one PTE safely when processors might be running on the - * destination pagetable. This does *not* give safety against - * concurrent writes (that's what the shadow lock is for), just - * stops the hardware picking up partially written entries. */ -{ - volatile unsigned long *d = dst; - unsigned long *s = src; - ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1))); -#if CONFIG_PAGING_LEVELS == 3 - /* In PAE mode, pagetable entries are larger - * than machine words, so won't get written atomically. We need to make - * sure any other cpu running on these shadows doesn't see a - * half-written entry. Do this by marking the entry not-present first, - * then writing the high word before the low word. */ - BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long)); - d[0] = 0; - d[1] = s[1]; - d[0] = s[0]; -#else - /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word, - * which will be an atomic write, since the entry is aligned. */ - BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long)); - *d = *s; -#endif -} - - -static inline void -shadow_write_entries(void *d, void *s, int entries, mfn_t mfn) -/* This function does the actual writes to shadow pages. - * It must not be called directly, since it doesn't do the bookkeeping - * that shadow_set_l*e() functions do. 
*/ -{ - shadow_l1e_t *dst = d; - shadow_l1e_t *src = s; - void *map = NULL; - int i; - - /* Because we mirror access rights at all levels in the shadow, an - * l2 (or higher) entry with the RW bit cleared will leave us with - * no write access through the linear map. - * We detect that by writing to the shadow with copy_to_user() and - * using map_domain_page() to get a writeable mapping if we need to. */ - if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 ) - { - perfc_incrc(shadow2_linear_map_failed); - map = sh2_map_domain_page(mfn); - ASSERT(map != NULL); - dst = map + ((unsigned long)dst & (PAGE_SIZE - 1)); - } - - - for ( i = 0; i < entries; i++ ) - safe_write_entry(dst++, src++); - - if ( map != NULL ) sh2_unmap_domain_page(map); - - /* XXX TODO: - * Update min/max field in page_info struct of this mfn */ -} - -static inline int -perms_strictly_increased(u32 old_flags, u32 new_flags) -/* Given the flags of two entries, are the new flags a strict - * increase in rights over the old ones? */ -{ - u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX); - u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX); - /* Flip the NX bit, since it's the only one that decreases rights; - * we calculate as if it were an "X" bit. */ - of ^= _PAGE_NX_BIT; - nf ^= _PAGE_NX_BIT; - /* If the changed bits are all set in the new flags, then rights strictly - * increased between old and new. 
*/ - return ((of | (of ^ nf)) == nf); -} - -static int inline -shadow2_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d) -{ - int res; - mfn_t mfn; - struct domain *owner; - shadow_l1e_t sanitized_sl1e = - shadow_l1e_remove_flags(sl1e, _PAGE_SHADOW_RW | _PAGE_SHADOW_PRESENT); - - //ASSERT(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT); - //ASSERT((shadow_l1e_get_flags(sl1e) & L1_DISALLOW_MASK) == 0); - - if ( !shadow2_mode_refcounts(d) ) - return 1; - - res = get_page_from_l1e(sanitized_sl1e, d); - - // If a privileged domain is attempting to install a map of a page it does - // not own, we let it succeed anyway. - // - if ( unlikely(!res) && - IS_PRIV(d) && - !shadow2_mode_translate(d) && - valid_mfn(mfn = shadow_l1e_get_mfn(sl1e)) && - (owner = page_get_owner(mfn_to_page(mfn))) && - (d != owner) ) - { - res = get_page_from_l1e(sanitized_sl1e, owner); - SHADOW2_PRINTK("privileged domain %d installs map of mfn %05lx " - "which is owned by domain %d: %s\n", - d->domain_id, mfn_x(mfn), owner->domain_id, - res ? 
"success" : "failed"); - } - - if ( unlikely(!res) ) - { - perfc_incrc(shadow2_get_page_fail); - SHADOW2_PRINTK("failed: l1e=" SH2_PRI_pte "\n"); - } - - return res; -} - -static void inline -shadow2_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d) -{ - if ( !shadow2_mode_refcounts(d) ) - return; - - put_page_from_l1e(sl1e, d); -} - -#if GUEST_PAGING_LEVELS >= 4 -static int shadow_set_l4e(struct vcpu *v, - shadow_l4e_t *sl4e, - shadow_l4e_t new_sl4e, - mfn_t sl4mfn) -{ - int flags = 0; - shadow_l4e_t old_sl4e; - paddr_t paddr; - ASSERT(sl4e != NULL); - old_sl4e = *sl4e; - - if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */ - - paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) - | (((unsigned long)sl4e) & ~PAGE_MASK)); - - if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT ) - { - /* About to install a new reference */ - sh2_get_ref(shadow_l4e_get_mfn(new_sl4e), paddr); - } - - /* Write the new entry */ - shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn); - flags |= SHADOW2_SET_CHANGED; - - if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT ) - { - /* We lost a reference to an old mfn. 
*/ - mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e); - if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e))) - || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e), - shadow_l4e_get_flags(new_sl4e)) ) - { - flags |= SHADOW2_SET_FLUSH; - } - sh2_put_ref(v, osl3mfn, paddr); - } - return flags; -} -#endif /* GUEST_PAGING_LEVELS >= 4 */ - -#if GUEST_PAGING_LEVELS >= 3 -static int shadow_set_l3e(struct vcpu *v, - shadow_l3e_t *sl3e, - shadow_l3e_t new_sl3e, - mfn_t sl3mfn) -{ - int flags = 0; - shadow_l3e_t old_sl3e; - paddr_t paddr; - ASSERT(sl3e != NULL); - old_sl3e = *sl3e; - - if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */ - - paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) - | (((unsigned long)sl3e) & ~PAGE_MASK)); - - if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT ) - { - /* About to install a new reference */ - sh2_get_ref(shadow_l3e_get_mfn(new_sl3e), paddr); - } - - /* Write the new entry */ - shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn); - flags |= SHADOW2_SET_CHANGED; - -#if GUEST_PAGING_LEVELS == 3 - /* We wrote a guest l3e in a PAE pagetable. This table is copied in - * the linear pagetable entries of its l2s, and may also be copied - * to a low memory location to make it fit in CR3. Report that we - * need to resync those copies (we can't wait for the guest to flush - * the TLB because it might be an increase in rights). */ - { - struct vcpu *vcpu; - - struct pae_l3_bookkeeping *info = sl3p_to_info(sl3e); - for_each_vcpu(v->domain, vcpu) - { - if (info->vcpus & (1 << vcpu->vcpu_id)) - { - // Remember that this flip/update needs to occur. - vcpu->arch.shadow2.pae_flip_pending = 1; - flags |= SHADOW2_SET_L3PAE_RECOPY; - } - } - } -#endif - - if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT ) - { - /* We lost a reference to an old mfn. 
*/ - mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e); - if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) || - !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e), - shadow_l3e_get_flags(new_sl3e)) ) - { - flags |= SHADOW2_SET_FLUSH; - } - sh2_put_ref(v, osl2mfn, paddr); - } - return flags; -} -#endif /* GUEST_PAGING_LEVELS >= 3 */ - -static int shadow_set_l2e(struct vcpu *v, - shadow_l2e_t *sl2e, - shadow_l2e_t new_sl2e, - mfn_t sl2mfn) -{ - int flags = 0; - shadow_l2e_t old_sl2e; - paddr_t paddr; - -#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2 - /* In 2-on-3 we work with pairs of l2es pointing at two-page - * shadows. Reference counting and up-pointers track from the first - * page of the shadow to the first l2e, so make sure that we're - * working with those: - * Align the pointer down so it's pointing at the first of the pair */ - sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t))); - /* Align the mfn of the shadow entry too */ - new_sl2e.l2 &= ~(1< 2 - { - shadow_l2e_t pair[2] = { new_sl2e, new_sl2e }; - /* The l1 shadow is two pages long and need to be pointed to by - * two adjacent l1es. The pair have the same flags, but point - * at odd and even MFNs */ - ASSERT(!(pair[0].l2 & (1<domain; - shadow_l1e_t old_sl1e; - ASSERT(sl1e != NULL); - - old_sl1e = *sl1e; - - if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */ - - if ( shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT ) - { - /* About to install a new reference */ - if ( shadow2_mode_refcounts(d) ) { - if ( shadow2_get_page_from_l1e(new_sl1e, d) == 0 ) - { - /* Doesn't look like a pagetable. */ - flags |= SHADOW2_SET_ERROR; - new_sl1e = shadow_l1e_empty(); - } - } - } - - /* Write the new entry */ - shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn); - flags |= SHADOW2_SET_CHANGED; - - if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT ) - { - /* We lost a reference to an old mfn. */ - /* N.B. 
Unlike higher-level sets, never need an extra flush - * when writing an l1e. Because it points to the same guest frame - * as the guest l1e did, it's the guest's responsibility to - * trigger a flush later. */ - if ( shadow2_mode_refcounts(d) ) - { - shadow2_put_page_from_l1e(old_sl1e, d); - } - } - return flags; -} - - -/**************************************************************************/ -/* These functions take a vcpu and a virtual address, and return a pointer - * to the appropriate level N entry from the shadow tables. - * If the necessary tables are not present in the shadow, they return NULL. */ - -/* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has - * more levels than the guest, the upper levels are always fixed and do not - * reflect any information from the guest, so we do not use these functions - * to access them. */ - -#if GUEST_PAGING_LEVELS >= 4 -static shadow_l4e_t * -shadow_get_l4e(struct vcpu *v, unsigned long va) -{ - /* Reading the top level table is always valid. */ - return sh2_linear_l4_table(v) + shadow_l4_linear_offset(va); -} -#endif /* GUEST_PAGING_LEVELS >= 4 */ - - -#if GUEST_PAGING_LEVELS >= 3 -static shadow_l3e_t * -shadow_get_l3e(struct vcpu *v, unsigned long va) -{ -#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */ - /* Get the l4 */ - shadow_l4e_t *sl4e = shadow_get_l4e(v, va); - ASSERT(sl4e != NULL); - if ( !(shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT) ) - return NULL; - ASSERT(valid_mfn(shadow_l4e_get_mfn(*sl4e))); - /* l4 was present; OK to get the l3 */ - return sh2_linear_l3_table(v) + shadow_l3_linear_offset(va); -#else /* PAE... */ - /* Top level is always mapped */ - ASSERT(v->arch.shadow_vtable); - return ((shadow_l3e_t *)v->arch.shadow_vtable) + shadow_l3_linear_offset(va); -#endif -} -#endif /* GUEST_PAGING_LEVELS >= 3 */ - - -static shadow_l2e_t * -shadow_get_l2e(struct vcpu *v, unsigned long va) -{ -#if GUEST_PAGING_LEVELS >= 3 /* 64bit/PAE... 
*/ - /* Get the l3 */ - shadow_l3e_t *sl3e = shadow_get_l3e(v, va); - if ( sl3e == NULL || !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) - return NULL; - ASSERT(valid_mfn(shadow_l3e_get_mfn(*sl3e))); - /* l3 was present; OK to get the l2 */ -#endif - return sh2_linear_l2_table(v) + shadow_l2_linear_offset(va); -} - - -#if 0 // avoid the compiler warning for now... - -static shadow_l1e_t * -shadow_get_l1e(struct vcpu *v, unsigned long va) -{ - /* Get the l2 */ - shadow_l2e_t *sl2e = shadow_get_l2e(v, va); - if ( sl2e == NULL || !(shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT) ) - return NULL; - ASSERT(valid_mfn(shadow_l2e_get_mfn(*sl2e))); - /* l2 was present; OK to get the l1 */ - return sh2_linear_l1_table(v) + shadow_l1_linear_offset(va); -} - -#endif - - -/**************************************************************************/ -/* Macros to walk pagetables. These take the shadow of a pagetable and - * walk every "interesting" entry. That is, they don't touch Xen mappings, - * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every - * second entry (since pairs of entries are managed together). For multi-page - * shadows they walk all pages. - * - * Arguments are an MFN, the variable to point to each entry, a variable - * to indicate that we are done (we will shortcut to the end of the scan - * when _done != 0), a variable to indicate that we should avoid Xen mappings, - * and the code. - * - * WARNING: These macros have side-effects. They change the values of both - * the pointer and the MFN. 
*/ - -static inline void increment_ptr_to_guest_entry(void *ptr) -{ - if ( ptr ) - { - guest_l1e_t **entry = ptr; - (*entry)++; - } -} - -/* All kinds of l1: touch all entries */ -#define _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ -do { \ - int _i; \ - shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \ - ASSERT((mfn_to_page(_sl1mfn)->count_info & PGC_SH2_type_mask) \ - == PGC_SH2_l1_shadow \ - || (mfn_to_page(_sl1mfn)->count_info & PGC_SH2_type_mask) \ - == PGC_SH2_fl1_shadow); \ - for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \ - { \ - (_sl1e) = _sp + _i; \ - if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \ - {_code} \ - if ( _done ) break; \ - increment_ptr_to_guest_entry(_gl1p); \ - } \ - unmap_shadow_page(_sp); \ -} while (0) - -/* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */ -#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2 -#define SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ -do { \ - int __done = 0; \ - _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \ - ({ (__done = _done); }), _code); \ - _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \ - if ( !__done ) \ - _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \ - ({ (__done = _done); }), _code); \ -} while (0) -#else /* Everything else; l1 shadows are only one page */ -#define SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ - _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) -#endif - - -#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2 - -/* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */ -#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ -do { \ - int _i, _j, __done = 0; \ - ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \ - == PGC_SH2_l2_32_shadow); \ - for ( _j = 0; _j < 4 && !__done; _j++ ) \ - { \ - shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \ - for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \ - if ( (!(_xen)) \ - || ((_j * 
SHADOW_L2_PAGETABLE_ENTRIES) + _i) \ - < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \ - { \ - (_sl2e) = _sp + _i; \ - if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ - {_code} \ - if ( (__done = (_done)) ) break; \ - increment_ptr_to_guest_entry(_gl2p); \ - } \ - unmap_shadow_page(_sp); \ - _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \ - } \ -} while (0) - -#elif GUEST_PAGING_LEVELS == 2 - -/* 32-bit on 32-bit: avoid Xen entries */ -#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ -do { \ - int _i; \ - shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \ - ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \ - == PGC_SH2_l2_32_shadow); \ - for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ - if ( (!(_xen)) \ - || \ - (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \ - { \ - (_sl2e) = _sp + _i; \ - if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ - {_code} \ - if ( _done ) break; \ - increment_ptr_to_guest_entry(_gl2p); \ - } \ - unmap_shadow_page(_sp); \ -} while (0) - -#elif GUEST_PAGING_LEVELS == 3 - -/* PAE: if it's an l2h, don't touch Xen mappings */ -#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ -do { \ - int _i; \ - shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \ - ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \ - == PGC_SH2_l2_pae_shadow \ - || (mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \ - == PGC_SH2_l2h_pae_shadow); \ - for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ - if ( (!(_xen)) \ - || ((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \ - != PGC_SH2_l2h_pae_shadow) \ - || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \ - < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \ - { \ - (_sl2e) = _sp + _i; \ - if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ - {_code} \ - if ( _done ) break; \ - increment_ptr_to_guest_entry(_gl2p); \ - } \ - unmap_shadow_page(_sp); \ -} while (0) - -#else - -/* 64-bit l2: 
touch all entries */ -#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ -do { \ - int _i; \ - shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \ - ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \ - == PGC_SH2_l2_64_shadow); \ - for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ - { \ - (_sl2e) = _sp + _i; \ - if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ - {_code} \ - if ( _done ) break; \ - increment_ptr_to_guest_entry(_gl2p); \ - } \ - unmap_shadow_page(_sp); \ -} while (0) - -#endif /* different kinds of l2 */ - -#if GUEST_PAGING_LEVELS == 3 - -/* PAE l3 subshadow: touch all entries (FOREACH_L2E will find Xen l2es). */ -#define SHADOW2_FOREACH_L3E_SUB(_sl3e, _gl3p, _done, _code) \ -do { \ - int _i; \ - for ( _i = 0; _i < 4; _i++ ) \ - { \ - if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \ - {_code} \ - if ( _done ) break; \ - _sl3e++; \ - increment_ptr_to_guest_entry(_gl3p); \ - } \ -} while (0) - -/* PAE l3 full shadow: call subshadow walk on all valid l3 subshadows */ -#define SHADOW2_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \ -do { \ - int _i, _j, _k, __done = 0; \ - ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH2_type_mask) \ - == PGC_SH2_l3_pae_shadow); \ - /* The subshadows are split, 64 on each page of the shadow */ \ - for ( _j = 0; _j < 2 && !__done; _j++ ) \ - { \ - void *_sp = sh2_map_domain_page(_sl3mfn); \ - for ( _i = 0; _i < 64; _i++ ) \ - { \ - /* Every second 32-byte region is a bookkeeping entry */ \ - _sl3e = (shadow_l3e_t *)(_sp + (64 * _i)); \ - if ( (sl3p_to_info(_sl3e))->refcount > 0 ) \ - SHADOW2_FOREACH_L3E_SUB(_sl3e, _gl3p, \ - ({ __done = (_done); __done; }), \ - _code); \ - else \ - for ( _k = 0 ; _k < 4 ; _k++ ) \ - increment_ptr_to_guest_entry(_gl3p); \ - if ( __done ) break; \ - } \ - sh2_unmap_domain_page(_sp); \ - _sl3mfn = _mfn(mfn_x(_sl3mfn) + 1); \ - } \ -} while (0) - -#elif GUEST_PAGING_LEVELS == 4 - -/* 64-bit l3: touch all entries */ -#define 
SHADOW2_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \ -do { \ - int _i; \ - shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \ - ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH2_type_mask) \ - == PGC_SH2_l3_64_shadow); \ - for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \ - { \ - (_sl3e) = _sp + _i; \ - if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \ - {_code} \ - if ( _done ) break; \ - increment_ptr_to_guest_entry(_gl3p); \ - } \ - unmap_shadow_page(_sp); \ -} while (0) - -/* 64-bit l4: avoid Xen mappings */ -#define SHADOW2_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code) \ -do { \ - int _i; \ - shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \ - ASSERT((mfn_to_page(_sl4mfn)->count_info & PGC_SH2_type_mask) \ - == PGC_SH2_l4_64_shadow); \ - for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \ - { \ - if ( (!(_xen)) || is_guest_l4_slot(_i) ) \ - { \ - (_sl4e) = _sp + _i; \ - if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \ - {_code} \ - if ( _done ) break; \ - } \ - increment_ptr_to_guest_entry(_gl4p); \ - } \ - unmap_shadow_page(_sp); \ -} while (0) - -#endif - - - -/**************************************************************************/ -/* Functions to install Xen mappings and linear mappings in shadow pages */ - -static mfn_t sh2_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type); - -// XXX -- this function should probably be moved to shadow2-common.c, but that -// probably wants to wait until the shadow types have been moved from -// shadow2-types.h to shadow2-private.h -// -#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4 -void sh2_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn) -{ - struct domain *d = v->domain; - shadow_l4e_t *sl4e; - - sl4e = sh2_map_domain_page(sl4mfn); - ASSERT(sl4e != NULL); - ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t)); - - /* Copy the common Xen mappings from the idle domain */ - memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT], - 
&idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT], - ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t)); - - /* Install the per-domain mappings for this domain */ - sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] = - shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)), - __PAGE_HYPERVISOR); - - /* Linear mapping */ - sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] = - shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR); - sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] = - shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR); - - if ( shadow2_mode_translate(v->domain) ) - { - /* install domain-specific P2M table */ - sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] = - shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table), - __PAGE_HYPERVISOR); - } - - sh2_unmap_domain_page(sl4e); -} -#endif - -#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3 -// For 3-on-3 PV guests, we need to make sure the xen mappings are in -// place, which means that we need to populate the l2h entry in the l3 -// table. - -void sh2_install_xen_entries_in_l2h(struct vcpu *v, - mfn_t sl2hmfn) -{ - struct domain *d = v->domain; - shadow_l2e_t *sl2e; - int i; - - sl2e = sh2_map_domain_page(sl2hmfn); - ASSERT(sl2e != NULL); - ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t)); - - /* Copy the common Xen mappings from the idle domain */ - memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)], - &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT], - L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t)); - - /* Install the per-domain mappings for this domain */ - for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) - sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] = - shadow_l2e_from_mfn( - page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i), - __PAGE_HYPERVISOR); - - /* We don't set up a linear mapping here because we can't until this - * l2h is installed in an l3e. sh2_update_linear_entries() handles - * the linear mappings when the l3 is loaded. 
*/ - - if ( shadow2_mode_translate(d) ) - { - /* Install the domain-specific p2m table */ - l3_pgentry_t *p2m; - ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0); - p2m = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); - for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ ) - { - sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] = - shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])), - __PAGE_HYPERVISOR); - } - sh2_unmap_domain_page(p2m); - } - - sh2_unmap_domain_page(sl2e); -} - -void sh2_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn) -{ - shadow_l3e_t *sl3e; - guest_l3e_t *gl3e = v->arch.guest_vtable; - shadow_l3e_t new_sl3e; - gfn_t l2gfn; - mfn_t l2gmfn, l2smfn; - int r; - - ASSERT(!shadow2_mode_external(v->domain)); - ASSERT(guest_l3e_get_flags(gl3e[3]) & _PAGE_PRESENT); - l2gfn = guest_l3e_get_gfn(gl3e[3]); - l2gmfn = sh2_gfn_to_mfn(v->domain, gfn_x(l2gfn)); - l2smfn = get_shadow_status(v, l2gmfn, PGC_SH2_l2h_shadow); - if ( !valid_mfn(l2smfn) ) - { - l2smfn = sh2_make_shadow(v, l2gmfn, PGC_SH2_l2h_shadow); - } - l3e_propagate_from_guest(v, &gl3e[3], gl3mfn, l2smfn, &new_sl3e, - ft_prefetch); - sl3e = sh2_map_domain_page(sl3mfn); - r = shadow_set_l3e(v, &sl3e[3], new_sl3e, sl3mfn); - sh2_unmap_domain_page(sl3e); -} -#endif - - -#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2 -void sh2_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn) -{ - struct domain *d = v->domain; - shadow_l2e_t *sl2e; - int i; - - sl2e = sh2_map_domain_page(sl2mfn); - ASSERT(sl2e != NULL); - ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t)); - - /* Copy the common Xen mappings from the idle domain */ - memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT], - &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT], - L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t)); - - /* Install the per-domain mappings for this domain */ - for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) - sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] = - shadow_l2e_from_mfn( - 
page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i), - __PAGE_HYPERVISOR); - - /* Linear mapping */ - sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] = - shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR); - sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] = - shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR); - - if ( shadow2_mode_translate(d) ) - { - /* install domain-specific P2M table */ - sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] = - shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table), - __PAGE_HYPERVISOR); - } - - sh2_unmap_domain_page(sl2e); -} -#endif - - - - - -/**************************************************************************/ -/* Create a shadow of a given guest page. - */ -static mfn_t -sh2_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type) -{ - mfn_t smfn = shadow2_alloc(v->domain, shadow_type, mfn_x(gmfn)); - SHADOW2_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n", - mfn_x(gmfn), shadow_type, mfn_x(smfn)); - - if ( shadow_type != PGC_SH2_guest_root_type ) - /* Lower-level shadow, not yet linked form a higher level */ - mfn_to_page(smfn)->up = 0; - - // Create the Xen mappings... 
- if ( !shadow2_mode_external(v->domain) ) - { - switch (shadow_type) - { -#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4 - case PGC_SH2_l4_shadow: - sh2_install_xen_entries_in_l4(v, gmfn, smfn); break; -#endif -#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3 - case PGC_SH2_l3_shadow: - sh2_install_xen_entries_in_l3(v, gmfn, smfn); break; - case PGC_SH2_l2h_shadow: - sh2_install_xen_entries_in_l2h(v, smfn); break; -#endif -#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2 - case PGC_SH2_l2_shadow: - sh2_install_xen_entries_in_l2(v, gmfn, smfn); break; -#endif - default: /* Do nothing */ break; - } - } - - shadow2_promote(v, gmfn, shadow_type); - set_shadow2_status(v, gmfn, shadow_type, smfn); - - return smfn; -} - -/* Make a splintered superpage shadow */ -static mfn_t -make_fl1_shadow(struct vcpu *v, gfn_t gfn) -{ - mfn_t smfn = shadow2_alloc(v->domain, PGC_SH2_fl1_shadow, - (unsigned long) gfn_x(gfn)); - - SHADOW2_DEBUG(MAKE_SHADOW, "(%" SH2_PRI_gfn ")=>%" SH2_PRI_mfn "\n", - gfn_x(gfn), mfn_x(smfn)); - - set_fl1_shadow_status(v, gfn, smfn); - return smfn; -} - - -#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS -mfn_t -sh2_make_monitor_table(struct vcpu *v) -{ - - ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0); - -#if CONFIG_PAGING_LEVELS == 4 - { - struct domain *d = v->domain; - mfn_t m4mfn; - m4mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0); - sh2_install_xen_entries_in_l4(v, m4mfn, m4mfn); - /* Remember the level of this table */ - mfn_to_page(m4mfn)->shadow2_flags = 4; -#if SHADOW_PAGING_LEVELS < 4 - // Install a monitor l3 table in slot 0 of the l4 table. - // This is used for shadow linear maps. 
- { - mfn_t m3mfn; - l4_pgentry_t *l4e; - m3mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0); - mfn_to_page(m3mfn)->shadow2_flags = 3; - l4e = sh2_map_domain_page(m4mfn); - l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR); - sh2_unmap_domain_page(l4e); - } -#endif /* SHADOW_PAGING_LEVELS < 4 */ - return m4mfn; - } - -#elif CONFIG_PAGING_LEVELS == 3 - - { - struct domain *d = v->domain; - mfn_t m3mfn, m2mfn; - l3_pgentry_t *l3e; - l2_pgentry_t *l2e; - int i; - - m3mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0); - /* Remember the level of this table */ - mfn_to_page(m3mfn)->shadow2_flags = 3; - - // Install a monitor l2 table in slot 3 of the l3 table. - // This is used for all Xen entries, including linear maps - m2mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0); - mfn_to_page(m2mfn)->shadow2_flags = 2; - l3e = sh2_map_domain_page(m3mfn); - l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT); - sh2_install_xen_entries_in_l2h(v, m2mfn); - /* Install the monitor's own linear map */ - l2e = sh2_map_domain_page(m2mfn); - for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) - l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] = - (l3e_get_flags(l3e[i]) & _PAGE_PRESENT) - ? 
l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR) - : l2e_empty(); - sh2_unmap_domain_page(l2e); - sh2_unmap_domain_page(l3e); - - SHADOW2_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn)); - return m3mfn; - } - -#elif CONFIG_PAGING_LEVELS == 2 - - { - struct domain *d = v->domain; - mfn_t m2mfn; - m2mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0); - sh2_install_xen_entries_in_l2(v, m2mfn, m2mfn); - /* Remember the level of this table */ - mfn_to_page(m2mfn)->shadow2_flags = 2; - return m2mfn; - } - -#else -#error this should not happen -#endif /* CONFIG_PAGING_LEVELS */ -} -#endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */ - -/**************************************************************************/ -/* These functions also take a virtual address and return the level-N - * shadow table mfn and entry, but they create the shadow pagetables if - * they are needed. The "demand" argument is non-zero when handling - * a demand fault (so we know what to do about accessed bits &c). - * If the necessary tables are not present in the guest, they return NULL. */ -#if GUEST_PAGING_LEVELS >= 4 -static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v, - walk_t *gw, - mfn_t *sl4mfn) -{ - /* There is always a shadow of the top level table. Get it. */ - *sl4mfn = pagetable_get_mfn(v->arch.shadow_table); - /* Reading the top level table is always valid. */ - return sh2_linear_l4_table(v) + shadow_l4_linear_offset(gw->va); -} -#endif /* GUEST_PAGING_LEVELS >= 4 */ - - -#if GUEST_PAGING_LEVELS >= 3 -static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v, - walk_t *gw, - mfn_t *sl3mfn, - fetch_type_t ft) -{ -#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */ - mfn_t sl4mfn; - shadow_l4e_t *sl4e; - if ( !valid_mfn(gw->l3mfn) ) return NULL; /* No guest page. 
*/ - /* Get the l4e */ - sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn); - ASSERT(sl4e != NULL); - if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) - { - *sl3mfn = shadow_l4e_get_mfn(*sl4e); - ASSERT(valid_mfn(*sl3mfn)); - } - else - { - int r; - shadow_l4e_t new_sl4e; - /* No l3 shadow installed: find and install it. */ - *sl3mfn = get_shadow_status(v, gw->l3mfn, PGC_SH2_l3_shadow); - if ( !valid_mfn(*sl3mfn) ) - { - /* No l3 shadow of this page exists at all: make one. */ - *sl3mfn = sh2_make_shadow(v, gw->l3mfn, PGC_SH2_l3_shadow); - } - /* Install the new sl3 table in the sl4e */ - l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn, - *sl3mfn, &new_sl4e, ft); - r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn); - ASSERT((r & SHADOW2_SET_FLUSH) == 0); - } - /* Now follow it down a level. Guaranteed to succeed. */ - return sh2_linear_l3_table(v) + shadow_l3_linear_offset(gw->va); -#else /* PAE... */ - /* There is always a shadow of the top level table. Get it. */ - *sl3mfn = pagetable_get_mfn(v->arch.shadow_table); - /* This next line is important: the shadow l3 table is in an 8k - * shadow and we need to return the right mfn of the pair. This call - * will set it for us as a side-effect. */ - (void) shadow_l3_index(sl3mfn, guest_index(gw->l3e)); - ASSERT(v->arch.shadow_vtable); - return ((shadow_l3e_t *)v->arch.shadow_vtable) - + shadow_l3_table_offset(gw->va); -#endif /* GUEST_PAGING_LEVELS >= 4 */ -} -#endif /* GUEST_PAGING_LEVELS >= 3 */ - - -static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v, - walk_t *gw, - mfn_t *sl2mfn, - fetch_type_t ft) -{ -#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64bit... */ - mfn_t sl3mfn = _mfn(INVALID_MFN); - shadow_l3e_t *sl3e; - if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. 
*/ - /* Get the l3e */ - sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft); - ASSERT(sl3e != NULL); /* Since we know guest PT is valid this far */ - if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) - { - *sl2mfn = shadow_l3e_get_mfn(*sl3e); - ASSERT(valid_mfn(*sl2mfn)); - } - else - { - int r; - shadow_l3e_t new_sl3e; - /* No l2 shadow installed: find and install it. */ - *sl2mfn = get_shadow_status(v, gw->l2mfn, PGC_SH2_l2_shadow); - if ( !valid_mfn(*sl2mfn) ) - { - /* No l2 shadow of this page exists at all: make one. */ - *sl2mfn = sh2_make_shadow(v, gw->l2mfn, PGC_SH2_l2_shadow); - } - /* Install the new sl2 table in the sl3e */ - l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn, - *sl2mfn, &new_sl3e, ft); - r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn); - ASSERT((r & SHADOW2_SET_FLUSH) == 0); -#if GUEST_PAGING_LEVELS == 3 - /* Need to sync up the linear maps, as we are about to use them */ - ASSERT( r & SHADOW2_SET_L3PAE_RECOPY ); - sh2_pae_recopy(v->domain); -#endif - } - /* Now follow it down a level. Guaranteed to succeed. */ - return sh2_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); -#else /* 32bit... */ - /* There is always a shadow of the top level table. Get it. */ - *sl2mfn = pagetable_get_mfn(v->arch.shadow_table); - /* This next line is important: the guest l2 has a 16k - * shadow, we need to return the right mfn of the four. This - * call will set it for us as a side-effect. */ - (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e)); - /* Reading the top level table is always valid. 
*/ - return sh2_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); -#endif -} - - -static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v, - walk_t *gw, - mfn_t *sl1mfn, - fetch_type_t ft) -{ - mfn_t sl2mfn; - shadow_l2e_t *sl2e; - - /* Get the l2e */ - sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft); - if ( sl2e == NULL ) return NULL; - if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) - { - *sl1mfn = shadow_l2e_get_mfn(*sl2e); - ASSERT(valid_mfn(*sl1mfn)); - } - else - { - shadow_l2e_t new_sl2e; - int r, flags = guest_l2e_get_flags(*gw->l2e); - /* No l1 shadow installed: find and install it. */ - if ( !(flags & _PAGE_PRESENT) ) - return NULL; /* No guest page. */ - if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) ) - { - /* Splintering a superpage */ - gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e); - *sl1mfn = get_fl1_shadow_status(v, l2gfn); - if ( !valid_mfn(*sl1mfn) ) - { - /* No fl1 shadow of this superpage exists at all: make one. */ - *sl1mfn = make_fl1_shadow(v, l2gfn); - } - } - else - { - /* Shadowing an actual guest l1 table */ - if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */ - *sl1mfn = get_shadow_status(v, gw->l1mfn, PGC_SH2_l1_shadow); - if ( !valid_mfn(*sl1mfn) ) - { - /* No l1 shadow of this page exists at all: make one. */ - *sl1mfn = sh2_make_shadow(v, gw->l1mfn, PGC_SH2_l1_shadow); - } - } - /* Install the new sl1 table in the sl2e */ - l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn, - *sl1mfn, &new_sl2e, ft); - r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn); - ASSERT((r & SHADOW2_SET_FLUSH) == 0); - /* This next line is important: in 32-on-PAE and 32-on-64 modes, - * the guest l1 table has an 8k shadow, and we need to return - * the right mfn of the pair. This call will set it for us as a - * side-effect. (In all other cases, it's a no-op and will be - * compiled out.) */ - (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va)); - } - /* Now follow it down a level. Guaranteed to succeed. 
*/ - return sh2_linear_l1_table(v) + shadow_l1_linear_offset(gw->va); -} - - - -/**************************************************************************/ -/* Destructors for shadow tables: - * Unregister the shadow, decrement refcounts of any entries present in it, - * and release the memory. - * - * N.B. These destructors do not clear the contents of the shadows. - * This allows us to delay TLB shootdowns until the page is being reused. - * See shadow2_alloc() and shadow2_free() for how this is handled. - */ - -#if GUEST_PAGING_LEVELS >= 4 -void sh2_destroy_l4_shadow(struct vcpu *v, mfn_t smfn) -{ - shadow_l4e_t *sl4e; - u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask; - mfn_t gmfn, sl4mfn; - int xen_mappings; - - SHADOW2_DEBUG(DESTROY_SHADOW, - "%s(%05lx)\n", __func__, mfn_x(smfn)); - ASSERT(t == PGC_SH2_l4_shadow); - - /* Record that the guest page isn't shadowed any more (in this type) */ - gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info); - delete_shadow2_status(v, gmfn, t, smfn); - shadow2_demote(v, gmfn, t); - /* Take this shadow off the list of root shadows */ - list_del_init(&mfn_to_page(smfn)->list); - - /* Decrement refcounts of all the old entries */ - xen_mappings = (!shadow2_mode_external(v->domain)); - sl4mfn = smfn; - SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, { - if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) - { - sh2_put_ref(v, shadow_l4e_get_mfn(*sl4e), - (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) - | ((unsigned long)sl4e & ~PAGE_MASK)); - } - }); - - /* Put the memory back in the pool */ - shadow2_free(v->domain, smfn); -} -#endif - -#if GUEST_PAGING_LEVELS >= 3 -void sh2_destroy_l3_shadow(struct vcpu *v, mfn_t smfn) -{ - shadow_l3e_t *sl3e; - u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask; - mfn_t gmfn, sl3mfn; - - SHADOW2_DEBUG(DESTROY_SHADOW, - "%s(%05lx)\n", __func__, mfn_x(smfn)); - ASSERT(t == PGC_SH2_l3_shadow); - - /* Record that the guest page isn't shadowed any more (in this type) */ - gmfn = 
_mfn(mfn_to_page(smfn)->u.inuse.type_info); - delete_shadow2_status(v, gmfn, t, smfn); - shadow2_demote(v, gmfn, t); -#if GUEST_PAGING_LEVELS == 3 - /* Take this shadow off the list of root shadows */ - list_del_init(&mfn_to_page(smfn)->list); -#endif - - /* Decrement refcounts of all the old entries */ - sl3mfn = smfn; - SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, 0, { - if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) - sh2_put_ref(v, shadow_l3e_get_mfn(*sl3e), - (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) - | ((unsigned long)sl3e & ~PAGE_MASK)); - }); - - /* Put the memory back in the pool */ - shadow2_free(v->domain, smfn); -} -#endif - - -#if GUEST_PAGING_LEVELS == 3 -static void sh2_destroy_l3_subshadow(struct vcpu *v, - shadow_l3e_t *sl3e) -/* Tear down just a single 4-entry l3 on a 2-page l3 shadow. */ -{ - int i; - ASSERT((unsigned long)sl3e % (4 * sizeof (shadow_l3e_t)) == 0); - for ( i = 0; i < GUEST_L3_PAGETABLE_ENTRIES; i++ ) - if ( shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT ) - sh2_put_ref(v, shadow_l3e_get_mfn(sl3e[i]), - maddr_from_mapped_domain_page(sl3e)); -} -#endif - -#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) -void sh2_unpin_all_l3_subshadows(struct vcpu *v, mfn_t smfn) -/* Walk a full PAE l3 shadow, unpinning all of the subshadows on it */ -{ - int i, j; - struct pae_l3_bookkeeping *bk; - - ASSERT((mfn_to_page(smfn)->count_info & PGC_SH2_type_mask) - == PGC_SH2_l3_pae_shadow); - /* The subshadows are split, 64 on each page of the shadow */ - for ( i = 0; i < 2; i++ ) - { - void *p = sh2_map_domain_page(_mfn(mfn_x(smfn) + i)); - for ( j = 0; j < 64; j++ ) - { - /* Every second 32-byte region is a bookkeeping entry */ - bk = (struct pae_l3_bookkeeping *)(p + (64 * j) + 32); - if ( bk->pinned ) - sh2_unpin_l3_subshadow(v, (shadow_l3e_t *)(p + (64*j)), smfn); - /* Check whether we've just freed the whole shadow */ - if ( (mfn_to_page(smfn)->count_info & PGC_SH2_count_mask) == 0 ) - { - sh2_unmap_domain_page(p); - return; - } - } - 
sh2_unmap_domain_page(p); - } -} -#endif - -void sh2_destroy_l2_shadow(struct vcpu *v, mfn_t smfn) -{ - shadow_l2e_t *sl2e; - u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask; - mfn_t gmfn, sl2mfn; - int xen_mappings; - - SHADOW2_DEBUG(DESTROY_SHADOW, - "%s(%05lx)\n", __func__, mfn_x(smfn)); - ASSERT(t == PGC_SH2_l2_shadow - || t == PGC_SH2_l2h_pae_shadow); - - /* Record that the guest page isn't shadowed any more (in this type) */ - gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info); - delete_shadow2_status(v, gmfn, t, smfn); - shadow2_demote(v, gmfn, t); -#if GUEST_PAGING_LEVELS == 2 - /* Take this shadow off the list of root shadows */ - list_del_init(&mfn_to_page(smfn)->list); -#endif - - /* Decrement refcounts of all the old entries */ - sl2mfn = smfn; - xen_mappings = (!shadow2_mode_external(v->domain) && - ((GUEST_PAGING_LEVELS == 2) || - ((GUEST_PAGING_LEVELS == 3) && - (t == PGC_SH2_l2h_pae_shadow)))); - SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, { - if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) - sh2_put_ref(v, shadow_l2e_get_mfn(*sl2e), - (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT) - | ((unsigned long)sl2e & ~PAGE_MASK)); - }); - - /* Put the memory back in the pool */ - shadow2_free(v->domain, smfn); -} - -void sh2_destroy_l1_shadow(struct vcpu *v, mfn_t smfn) -{ - struct domain *d = v->domain; - shadow_l1e_t *sl1e; - u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask; - - SHADOW2_DEBUG(DESTROY_SHADOW, - "%s(%05lx)\n", __func__, mfn_x(smfn)); - ASSERT(t == PGC_SH2_l1_shadow || t == PGC_SH2_fl1_shadow); - - /* Record that the guest page isn't shadowed any more (in this type) */ - if ( t == PGC_SH2_fl1_shadow ) - { - gfn_t gfn = _gfn(mfn_to_page(smfn)->u.inuse.type_info); - delete_fl1_shadow_status(v, gfn, smfn); - } - else - { - mfn_t gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info); - delete_shadow2_status(v, gmfn, t, smfn); - shadow2_demote(v, gmfn, t); - } - - if ( shadow2_mode_refcounts(d) ) - { - /* Decrement refcounts 
of all the old entries */ - mfn_t sl1mfn = smfn; - SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, 0, { - if ( shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT ) - shadow2_put_page_from_l1e(*sl1e, d); - }); - } - - /* Put the memory back in the pool */ - shadow2_free(v->domain, smfn); -} - -#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS -void sh2_destroy_monitor_table(struct vcpu *v, mfn_t mmfn) -{ - struct domain *d = v->domain; - ASSERT((mfn_to_page(mmfn)->count_info & PGC_SH2_type_mask) - == PGC_SH2_monitor_table); - -#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4) - /* Need to destroy the l3 monitor page in slot 0 too */ - { - l4_pgentry_t *l4e = sh2_map_domain_page(mmfn); - ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT); - shadow2_free(d, _mfn(l4e_get_pfn(l4e[0]))); - sh2_unmap_domain_page(l4e); - } -#elif CONFIG_PAGING_LEVELS == 3 - /* Need to destroy the l2 monitor page in slot 4 too */ - { - l3_pgentry_t *l3e = sh2_map_domain_page(mmfn); - ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT); - shadow2_free(d, _mfn(l3e_get_pfn(l3e[3]))); - sh2_unmap_domain_page(l3e); - } -#endif - - /* Put the memory back in the pool */ - shadow2_free(d, mmfn); -} -#endif - -/**************************************************************************/ -/* Functions to destroy non-Xen mappings in a pagetable hierarchy. - * These are called from common code when we are running out of shadow - * memory, and unpinning all the top-level shadows hasn't worked. - * - * This implementation is pretty crude and slow, but we hope that it won't - * be called very often. 
*/ - -#if GUEST_PAGING_LEVELS == 2 - -void sh2_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn) -{ - shadow_l2e_t *sl2e; - int xen_mappings = !shadow2_mode_external(v->domain); - SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, { - (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn); - }); -} - -#elif GUEST_PAGING_LEVELS == 3 - -void sh2_unhook_pae_mappings(struct vcpu *v, mfn_t sl3mfn) -/* Walk a full PAE l3 shadow, unhooking entries from all the subshadows */ -{ - shadow_l3e_t *sl3e; - SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, 0, { - if ( (shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) { - mfn_t sl2mfn = shadow_l3e_get_mfn(*sl3e); - if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH2_type_mask) - == PGC_SH2_l2h_pae_shadow ) - { - /* High l2: need to pick particular l2es to unhook */ - shadow_l2e_t *sl2e; - SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, 1, { - (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn); - }); - } - else - { - /* Normal l2: can safely unhook the whole l3e */ - (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn); - } - } - }); - /* We've changed PAE L3 entries: must sync up various copies of them */ - sh2_pae_recopy(v->domain); -} - -#elif GUEST_PAGING_LEVELS == 4 - -void sh2_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn) -{ - shadow_l4e_t *sl4e; - int xen_mappings = !shadow2_mode_external(v->domain); - SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, { - (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn); - }); -} - -#endif - -/**************************************************************************/ -/* Internal translation functions. - * These functions require a pointer to the shadow entry that will be updated. - */ - -/* These functions take a new guest entry, translate it to shadow and write - * the shadow entry. - * - * They return the same bitmaps as the shadow_set_lXe() functions. 
- */ - -#if GUEST_PAGING_LEVELS >= 4 -static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se) -{ - shadow_l4e_t new_sl4e; - guest_l4e_t *new_gl4e = new_ge; - shadow_l4e_t *sl4p = se; - mfn_t sl3mfn = _mfn(INVALID_MFN); - int result = 0; - - perfc_incrc(shadow2_validate_gl4e_calls); - - if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT ) - { - gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e); - mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn); - if ( valid_mfn(gl3mfn) ) - sl3mfn = get_shadow_status(v, gl3mfn, PGC_SH2_l3_shadow); - else - result |= SHADOW2_SET_ERROR; - } - l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN), - sl3mfn, &new_sl4e, ft_prefetch); - result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn); - return result; -} -#endif // GUEST_PAGING_LEVELS >= 4 - -#if GUEST_PAGING_LEVELS >= 3 -static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se) -{ - shadow_l3e_t new_sl3e; - guest_l3e_t *new_gl3e = new_ge; - shadow_l3e_t *sl3p = se; - mfn_t sl2mfn = _mfn(INVALID_MFN); - int result = 0; - - perfc_incrc(shadow2_validate_gl3e_calls); - - if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT ) - { - gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e); - mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn); - if ( valid_mfn(gl2mfn) ) - sl2mfn = get_shadow_status(v, gl2mfn, PGC_SH2_l2_shadow); - else - result |= SHADOW2_SET_ERROR; - } - l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN), - sl2mfn, &new_sl3e, ft_prefetch); - result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn); - -#if GUEST_PAGING_LEVELS == 3 - /* We have changed a PAE l3 entry: need to sync up the possible copies - * of it */ - if ( result & SHADOW2_SET_L3PAE_RECOPY ) - sh2_pae_recopy(v->domain); -#endif - - return result; -} -#endif // GUEST_PAGING_LEVELS >= 3 - -static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se) -{ - shadow_l2e_t new_sl2e; - guest_l2e_t *new_gl2e = new_ge; - shadow_l2e_t *sl2p = se; - mfn_t sl1mfn = _mfn(INVALID_MFN); - int 
result = 0; - - perfc_incrc(shadow2_validate_gl2e_calls); - - if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT ) - { - gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e); - if ( guest_supports_superpages(v) && - (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) ) - { - // superpage -- need to look up the shadow L1 which holds the - // splitters... - sl1mfn = get_fl1_shadow_status(v, gl1gfn); -#if 0 - // XXX - it's possible that we want to do some kind of prefetch - // for superpage fl1's here, but this is *not* on the demand path, - // so we'll hold off trying that for now... - // - if ( !valid_mfn(sl1mfn) ) - sl1mfn = make_fl1_shadow(v, gl1gfn); -#endif - } - else - { - mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn); - if ( valid_mfn(gl1mfn) ) - sl1mfn = get_shadow_status(v, gl1mfn, PGC_SH2_l1_shadow); - else - result |= SHADOW2_SET_ERROR; - } - } - l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN), - sl1mfn, &new_sl2e, ft_prefetch); - result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn); - - return result; -} - -static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se) -{ - shadow_l1e_t new_sl1e; - guest_l1e_t *new_gl1e = new_ge; - shadow_l1e_t *sl1p = se; - gfn_t gfn; - mfn_t mfn; - int result = 0; - - perfc_incrc(shadow2_validate_gl1e_calls); - - gfn = guest_l1e_get_gfn(*new_gl1e); - mfn = vcpu_gfn_to_mfn(v, gfn); - - l1e_propagate_from_guest(v, *new_gl1e, &new_sl1e, - /* mmio? */ !valid_mfn(mfn)); - - result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn); - return result; -} - - -/**************************************************************************/ -/* Functions which translate and install a the shadows of arbitrary guest - * entries that we have just seen the guest write. 
*/ - - -static inline int -sh2_map_and_validate(struct vcpu *v, mfn_t gmfn, - void *new_gp, u32 size, u32 sh_type, - u32 (*shadow_index)(mfn_t *smfn, u32 idx), - int (*validate_ge)(struct vcpu *v, void *ge, - mfn_t smfn, void *se)) -/* Generic function for mapping and validating. */ -{ - mfn_t smfn, smfn2, map_mfn; - shadow_l1e_t *sl1p; - u32 shadow_idx, guest_idx; - int result = 0; - - /* Align address and size to guest entry boundaries */ - size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1); - new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1)); - size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1); - ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE); - - /* Map the shadow page */ - smfn = get_shadow_status(v, gmfn, sh_type); - ASSERT(valid_mfn(smfn)); /* Otherwise we would not have been called */ - guest_idx = guest_index(new_gp); - map_mfn = smfn; - shadow_idx = shadow_index(&map_mfn, guest_idx); - sl1p = map_shadow_page(map_mfn); - - /* Validate one entry at a time */ - while ( size ) - { - smfn2 = smfn; - guest_idx = guest_index(new_gp); - shadow_idx = shadow_index(&smfn2, guest_idx); - if ( mfn_x(smfn2) != mfn_x(map_mfn) ) - { - /* We have moved to another page of the shadow */ - map_mfn = smfn2; - unmap_shadow_page(sl1p); - sl1p = map_shadow_page(map_mfn); - } - result |= validate_ge(v, - new_gp, - map_mfn, - &sl1p[shadow_idx]); - size -= sizeof(guest_l1e_t); - new_gp += sizeof(guest_l1e_t); - } - unmap_shadow_page(sl1p); - return result; -} - - -int -sh2_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn, - void *new_gl4p, u32 size) -{ -#if GUEST_PAGING_LEVELS >= 4 - return sh2_map_and_validate(v, gl4mfn, new_gl4p, size, - PGC_SH2_l4_shadow, - shadow_l4_index, - validate_gl4e); -#else // ! 
GUEST_PAGING_LEVELS >= 4 - SHADOW2_PRINTK("called in wrong paging mode!\n"); - BUG(); - return 0; -#endif -} - -int -sh2_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn, - void *new_gl3p, u32 size) -{ -#if GUEST_PAGING_LEVELS >= 3 - return sh2_map_and_validate(v, gl3mfn, new_gl3p, size, - PGC_SH2_l3_shadow, - shadow_l3_index, - validate_gl3e); -#else // ! GUEST_PAGING_LEVELS >= 3 - SHADOW2_PRINTK("called in wrong paging mode!\n"); - BUG(); - return 0; -#endif -} - -int -sh2_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn, - void *new_gl2p, u32 size) -{ - return sh2_map_and_validate(v, gl2mfn, new_gl2p, size, - PGC_SH2_l2_shadow, - shadow_l2_index, - validate_gl2e); -} - -int -sh2_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn, - void *new_gl2p, u32 size) -{ -#if GUEST_PAGING_LEVELS == 3 - return sh2_map_and_validate(v, gl2mfn, new_gl2p, size, - PGC_SH2_l2h_shadow, - shadow_l2_index, - validate_gl2e); -#else /* Non-PAE guests don't have different kinds of l2 table */ - SHADOW2_PRINTK("called in wrong paging mode!\n"); - BUG(); - return 0; -#endif -} - -int -sh2_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn, - void *new_gl1p, u32 size) -{ - return sh2_map_and_validate(v, gl1mfn, new_gl1p, size, - PGC_SH2_l1_shadow, - shadow_l1_index, - validate_gl1e); -} - - -/**************************************************************************/ -/* Optimization: If we see two emulated writes of zeros to the same - * page-table without another kind of page fault in between, we guess - * that this is a batch of changes (for process destruction) and - * unshadow the page so we don't take a pagefault on every entry. This - * should also make finding writeable mappings of pagetables much - * easier. 
*/ - -/* Look to see if this is the second emulated write in a row to this - * page, and unshadow/unhook if it is */ -static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn) -{ -#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW - if ( v->arch.shadow2.last_emulated_mfn == mfn_x(gmfn) && - sh2_mfn_is_a_page_table(gmfn) ) - { - u32 flags = mfn_to_page(gmfn)->shadow2_flags; - mfn_t smfn; - if ( !(flags & (SH2F_L2_32|SH2F_L3_PAE|SH2F_L4_64)) ) - { - perfc_incrc(shadow2_early_unshadow); - sh2_remove_shadows(v, gmfn, 0 /* Can fail to unshadow */ ); - return; - } - /* SH2F_unhooked_mappings is set to make sure we only unhook - * once in a single batch of updates. It is reset when this - * top-level page is loaded into CR3 again */ - if ( !(flags & SH2F_unhooked_mappings) ) - { - perfc_incrc(shadow2_early_unshadow_top); - mfn_to_page(gmfn)->shadow2_flags |= SH2F_unhooked_mappings; - if ( flags & SH2F_L2_32 ) - { - smfn = get_shadow_status(v, gmfn, PGC_SH2_l2_32_shadow); - shadow2_unhook_mappings(v, smfn); - } - if ( flags & SH2F_L3_PAE ) - { - smfn = get_shadow_status(v, gmfn, PGC_SH2_l3_pae_shadow); - shadow2_unhook_mappings(v, smfn); - } - if ( flags & SH2F_L4_64 ) - { - smfn = get_shadow_status(v, gmfn, PGC_SH2_l4_64_shadow); - shadow2_unhook_mappings(v, smfn); - } - } - } - v->arch.shadow2.last_emulated_mfn = mfn_x(gmfn); -#endif -} - -/* Stop counting towards early unshadows, as we've seen a real page fault */ -static inline void reset_early_unshadow(struct vcpu *v) -{ -#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW - v->arch.shadow2.last_emulated_mfn = INVALID_MFN; -#endif -} - - - -/**************************************************************************/ -/* Entry points into the shadow code */ - -/* Called from pagefault handler in Xen, and from the HVM trap handlers - * for pagefaults. 
Returns 1 if this fault was an artefact of the - * shadow code (and the guest should retry) or 0 if it is not (and the - * fault should be handled elsewhere or passed to the guest). */ - -static int sh2_page_fault(struct vcpu *v, - unsigned long va, - struct cpu_user_regs *regs) -{ - struct domain *d = v->domain; - walk_t gw; - u32 accumulated_gflags; - gfn_t gfn; - mfn_t gmfn, sl1mfn=_mfn(0); - shadow_l1e_t sl1e, *ptr_sl1e; - paddr_t gpa; - struct cpu_user_regs emul_regs; - struct x86_emulate_ctxt emul_ctxt; - int r, mmio; - fetch_type_t ft = 0; - - // - // XXX: Need to think about eventually mapping superpages directly in the - // shadow (when possible), as opposed to splintering them into a - // bunch of 4K maps. - // - - SHADOW2_PRINTK("d:v=%u:%u va=%#lx err=%u\n", - v->domain->domain_id, v->vcpu_id, va, regs->error_code); - - shadow2_lock(d); - - shadow2_audit_tables(v); - - if ( guest_walk_tables(v, va, &gw, 1) != 0 ) - { - SHADOW2_PRINTK("malformed guest pagetable!"); - print_gw(&gw); - } - - sh2_audit_gw(v, &gw); - - // We do not look at the gw->l1e, as that will not exist for superpages. - // Instead, we use the gw->eff_l1e... - // - // We need not check all the levels of the guest page table entries for - // present vs not-present, as the eff_l1e will always be not present if - // one of the higher level entries is not present. - // - if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) ) - { - if ( hvm_guest(v) && !shadow2_vcpu_mode_translate(v) ) - { - /* Not present in p2m map, means this is mmio */ - gpa = va; - goto mmio; - } - - perfc_incrc(shadow2_fault_bail_not_present); - goto not_a_shadow_fault; - } - - // All levels of the guest page table are now known to be present. - accumulated_gflags = accumulate_guest_flags(&gw); - - // Check for attempts to access supervisor-only pages from user mode, - // i.e. ring 3. Such errors are not caused or dealt with by the shadow - // code. 
- // - if ( (regs->error_code & PFEC_user_mode) && - !(accumulated_gflags & _PAGE_USER) ) - { - /* illegal user-mode access to supervisor-only page */ - perfc_incrc(shadow2_fault_bail_user_supervisor); - goto not_a_shadow_fault; - } - - // Was it a write fault? - // - if ( regs->error_code & PFEC_write_access ) - { - if ( unlikely(!(accumulated_gflags & _PAGE_RW)) ) - { - perfc_incrc(shadow2_fault_bail_ro_mapping); - goto not_a_shadow_fault; - } - } - else // must have been either an insn fetch or read fault - { - // Check for NX bit violations: attempts to execute code that is - // marked "do not execute". Such errors are not caused or dealt with - // by the shadow code. - // - if ( regs->error_code & PFEC_insn_fetch ) - { - if ( accumulated_gflags & _PAGE_NX_BIT ) - { - /* NX prevented this code fetch */ - perfc_incrc(shadow2_fault_bail_nx); - goto not_a_shadow_fault; - } - } - } - - /* Is this an MMIO access? */ - gfn = guest_l1e_get_gfn(gw.eff_l1e); - mmio = ( hvm_guest(v) - && shadow2_vcpu_mode_translate(v) - && mmio_space(gfn_to_paddr(gfn)) ); - - /* For MMIO, the shadow holds the *gfn*; for normal accesses, if holds - * the equivalent mfn. */ - if ( mmio ) - gmfn = _mfn(gfn_x(gfn)); - else - { - gmfn = vcpu_gfn_to_mfn(v, gfn); - if ( !valid_mfn(gmfn) ) - { - perfc_incrc(shadow2_fault_bail_bad_gfn); - SHADOW2_PRINTK("BAD gfn=%"SH2_PRI_gfn" gmfn=%"SH2_PRI_mfn"\n", - gfn_x(gfn), mfn_x(gmfn)); - goto not_a_shadow_fault; - } - } - - /* Make sure there is enough free shadow memory to build a chain of - * shadow tables: one SHADOW2_MAX_ORDER chunk will always be enough - * to allocate all we need. (We never allocate a top-level shadow - * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */ - shadow2_prealloc(d, SHADOW2_MAX_ORDER); - - /* Acquire the shadow. This must happen before we figure out the rights - * for the shadow entry, since we might promote a page here. 
*/ - // XXX -- this code will need to change somewhat if/when the shadow code - // can directly map superpages... - ft = ((regs->error_code & PFEC_write_access) ? - ft_demand_write : ft_demand_read); - ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft); - ASSERT(ptr_sl1e); - - /* Calculate the shadow entry */ - if ( ft == ft_demand_write ) - { - if ( l1e_write_fault(v, &gw, gmfn, &sl1e, mmio) ) - { - perfc_incrc(shadow2_fault_emulate_write); - goto emulate; - } - } - else if ( l1e_read_fault(v, &gw, gmfn, &sl1e, mmio) ) - { - perfc_incrc(shadow2_fault_emulate_read); - goto emulate; - } - - /* Quick sanity check: we never make an MMIO entry that's got the - * _PAGE_PRESENT flag set in it. */ - ASSERT(!mmio || !(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT)); - - r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn); - - if ( mmio ) - { - gpa = guest_walk_to_gpa(&gw); - goto mmio; - } - -#if 0 - if ( !(r & SHADOW2_SET_CHANGED) ) - debugtrace_printk("%s: shadow_set_l1e(va=%p, sl1e=%" SH2_PRI_pte - ") did not change anything\n", - __func__, gw.va, l1e_get_intpte(sl1e)); -#endif - - perfc_incrc(shadow2_fault_fixed); - d->arch.shadow2.fault_count++; - reset_early_unshadow(v); - - done: - sh2_audit_gw(v, &gw); - unmap_walk(v, &gw); - SHADOW2_PRINTK("fixed\n"); - shadow2_audit_tables(v); - shadow2_unlock(d); - return EXCRET_fault_fixed; - - emulate: - - /* Take the register set we were called with */ - emul_regs = *regs; - if ( hvm_guest(v) ) - { - /* Add the guest's segment selectors, rip, rsp. rflags */ - hvm_store_cpu_guest_regs(v, &emul_regs, NULL); - } - emul_ctxt.regs = &emul_regs; - emul_ctxt.cr2 = va; - emul_ctxt.mode = hvm_guest(v) ? 
hvm_guest_x86_mode(v) : X86EMUL_MODE_HOST; - - SHADOW2_PRINTK("emulate: eip=%#lx\n", emul_regs.eip); - - v->arch.shadow2.propagate_fault = 0; - if ( x86_emulate_memop(&emul_ctxt, &shadow2_emulator_ops) ) - { - SHADOW2_PRINTK("emulator failure, unshadowing mfn %#lx\n", - mfn_x(gmfn)); - perfc_incrc(shadow2_fault_emulate_failed); - /* If this is actually a page table, then we have a bug, and need - * to support more operations in the emulator. More likely, - * though, this is a hint that this page should not be shadowed. */ - shadow2_remove_all_shadows(v, gmfn); - /* This means that actual missing operations will cause the - * guest to loop on the same page fault. */ - goto done; - } - if ( v->arch.shadow2.propagate_fault ) - { - /* Emulation triggered another page fault */ - goto not_a_shadow_fault; - } - - /* Emulator has changed the user registers: write back */ - if ( hvm_guest(v) ) - { - /* Write back the guest's segment selectors, rip, rsp. rflags */ - hvm_load_cpu_guest_regs(v, &emul_regs); - /* And don't overwrite those in the caller's regs. */ - emul_regs.eip = regs->eip; - emul_regs.cs = regs->cs; - emul_regs.eflags = regs->eflags; - emul_regs.esp = regs->esp; - emul_regs.ss = regs->ss; - emul_regs.es = regs->es; - emul_regs.ds = regs->ds; - emul_regs.fs = regs->fs; - emul_regs.gs = regs->gs; - } - *regs = emul_regs; - - goto done; - - mmio: - perfc_incrc(shadow2_fault_mmio); - if ( !hvm_apic_support(d) && (gpa >= 0xFEC00000) ) - { - /* Need to deal with these disabled-APIC accesses, as - * handle_mmio() apparently does not currently do that. */ - /* TJD: What about it, then? For now, I'm turning this BUG() - * into a domain_crash() since we don't want to kill Xen. 
*/ - SHADOW2_ERROR("disabled-APIC access: not supported\n."); - domain_crash(d); - } - sh2_audit_gw(v, &gw); - unmap_walk(v, &gw); - SHADOW2_PRINTK("mmio\n"); - shadow2_audit_tables(v); - reset_early_unshadow(v); - shadow2_unlock(d); - sh2_log_mmio(v, gpa); - handle_mmio(va, gpa); - return EXCRET_fault_fixed; - - not_a_shadow_fault: - sh2_audit_gw(v, &gw); - unmap_walk(v, &gw); - SHADOW2_PRINTK("not a shadow fault\n"); - shadow2_audit_tables(v); - reset_early_unshadow(v); - shadow2_unlock(d); - return 0; -} - - -static int -sh2_invlpg(struct vcpu *v, unsigned long va) -/* Called when the guest requests an invlpg. Returns 1 if the invlpg - * instruction should be issued on the hardware, or 0 if it's safe not - * to do so. */ -{ - shadow_l2e_t *ptr_sl2e = shadow_get_l2e(v, va); - - // XXX -- might be a good thing to prefetch the va into the shadow - - // no need to flush anything if there's no SL2... - // - if ( !ptr_sl2e ) - return 0; - - // If there's nothing shadowed for this particular sl2e, then - // there is no need to do an invlpg, either... - // - if ( !(shadow_l2e_get_flags(*ptr_sl2e) & _PAGE_PRESENT) ) - return 0; - - // Check to see if the SL2 is a splintered superpage... - // If so, then we'll need to flush the entire TLB (because that's - // easier than invalidating all of the individual 4K pages). - // - if ( (mfn_to_page(shadow_l2e_get_mfn(*ptr_sl2e))->count_info & - PGC_SH2_type_mask) == PGC_SH2_fl1_shadow ) - { - local_flush_tlb(); - return 0; - } - - return 1; -} - -static unsigned long -sh2_gva_to_gfn(struct vcpu *v, unsigned long va) -/* Called to translate a guest virtual address to what the *guest* - * pagetables would map it to. 
*/ -{ - walk_t gw; - gfn_t gfn; - - guest_walk_tables(v, va, &gw, 0); - gfn = guest_walk_to_gfn(&gw); - unmap_walk(v, &gw); - - return gfn_x(gfn); -} - - -static unsigned long -sh2_gva_to_gpa(struct vcpu *v, unsigned long va) -/* Called to translate a guest virtual address to what the *guest* - * pagetables would map it to. */ -{ - unsigned long gfn = sh2_gva_to_gfn(v, va); - if ( gfn == INVALID_GFN ) - return 0; - else - return (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK); -} - - -// XXX -- should this be in this file? -// Or should it be moved to shadow2-common.c? -// -/* returns a lowmem machine address of the copied HVM L3 root table - * If clear_res != 0, then clear the PAE-l3 reserved bits in the copy, - * otherwise blank out any entries with reserved bits in them. */ -#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) -static unsigned long -hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res) -{ - int i, f; - int res = (_PAGE_RW|_PAGE_NX_BIT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY); - l3_pgentry_t new_l3e, *copy = v->arch.hvm_vcpu.hvm_lowmem_l3tab; - memcpy(copy, l3tab, 4 * sizeof(l3_pgentry_t)); - for ( i = 0; i < 4; i++ ) - { - f = l3e_get_flags(l3tab[i]); - if ( (f & _PAGE_PRESENT) && (!(f & res) || clear_res) ) - new_l3e = l3e_from_pfn(l3e_get_pfn(l3tab[i]), f & ~res); - else - new_l3e = l3e_empty(); - safe_write_entry(©[i], &new_l3e); - } - return __pa(copy); -} -#endif - - -static inline void -sh2_update_linear_entries(struct vcpu *v) -/* Sync up all the linear mappings for this vcpu's pagetables */ -{ - struct domain *d = v->domain; - - /* Linear pagetables in PV guests - * ------------------------------ - * - * Guest linear pagetables, which map the guest pages, are at - * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the - * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these - * are set up at shadow creation time, but (of course!) the PAE case - * is subtler. 
Normal linear mappings are made by having an entry - * in the top-level table that points to itself (shadow linear) or - * to the guest top-level table (guest linear). For PAE, to set up - * a linear map requires us to copy the four top-level entries into - * level-2 entries. That means that every time we change a PAE l3e, - * we need to reflect the change into the copy. - * - * Linear pagetables in HVM guests - * ------------------------------- - * - * For HVM guests, the linear pagetables are installed in the monitor - * tables (since we can't put them in the shadow). Shadow linear - * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START, - * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for - * a linear pagetable of the monitor tables themselves. We have - * the same issue of having to re-copy PAE l3 entries whevever we use - * PAE shadows. - * - * Because HVM guests run on the same monitor tables regardless of the - * shadow tables in use, the linear mapping of the shadow tables has to - * be updated every time v->arch.shadow_table changes. - */ - - /* Don't try to update the monitor table if it doesn't exist */ - if ( shadow2_mode_external(d) - && pagetable_get_pfn(v->arch.monitor_table) == 0 ) - return; - -#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4) - - /* For PV, one l4e points at the guest l4, one points at the shadow - * l4. No maintenance required. - * For HVM, just need to update the l4e that points to the shadow l4. 
*/ - - if ( shadow2_mode_external(d) ) - { - /* Use the linear map if we can; otherwise make a new mapping */ - if ( v == current ) - { - __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] = - l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), - __PAGE_HYPERVISOR); - } - else - { - l4_pgentry_t *ml4e; - ml4e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); - ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] = - l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), - __PAGE_HYPERVISOR); - sh2_unmap_domain_page(ml4e); - } - } - -#elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3) - - /* This case only exists in HVM. To give ourselves a linear map of the - * shadows, we need to extend a PAE shadow to 4 levels. We do this by - * having a monitor l3 in slot 0 of the monitor l4 table, and - * copying the PAE l3 entries into it. Then, by having the monitor l4e - * for shadow pagetables also point to the monitor l4, we can use it - * to access the shadows. */ - - if ( shadow2_mode_external(d) ) - { - /* Install copies of the shadow l3es into the monitor l3 table. 
- * The monitor l3 table is hooked into slot 0 of the monitor - * l4 table, so we use l3 linear indices 0 to 3 */ - shadow_l3e_t *sl3e; - l3_pgentry_t *ml3e; - mfn_t l3mfn; - int i; - - /* Use linear mappings if we can; otherwise make new mappings */ - if ( v == current ) - { - ml3e = __linear_l3_table; - l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0])); -#if GUEST_PAGING_LEVELS == 2 - /* Shadow l3 tables are made up by update_cr3 */ - sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab; -#else - sl3e = v->arch.shadow_vtable; -#endif - } - else - { - l4_pgentry_t *ml4e; - ml4e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); - ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT); - l3mfn = _mfn(l4e_get_pfn(ml4e[0])); - ml3e = sh2_map_domain_page(l3mfn); - sh2_unmap_domain_page(ml4e); -#if GUEST_PAGING_LEVELS == 2 - /* Shadow l3 tables are made up by update_cr3 */ - sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab; -#else - sl3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.shadow_table)); -#endif - } - - for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) - { - ml3e[i] = - (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT) - ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])), - __PAGE_HYPERVISOR) - : l3e_empty(); - } - - if ( v != current ) - { - sh2_unmap_domain_page(ml3e); -#if GUEST_PAGING_LEVELS != 2 - sh2_unmap_domain_page(sl3e); -#endif - } - } - -#elif CONFIG_PAGING_LEVELS == 3 - - /* PV: need to copy the guest's l3 entries into the guest-linear-map l2 - * entries in the shadow, and the shadow's l3 entries into the - * shadow-linear-map l2 entries in the shadow. This is safe to do - * because Xen does not let guests share high-slot l2 tables between l3s, - * so we know we're not treading on anyone's toes. - * - * HVM: need to copy the shadow's l3 entries into the - * shadow-linear-map l2 entries in the monitor table. This is safe - * because we have one monitor table for each vcpu. The monitor's - * own l3es don't need to be copied because they never change. 
- * XXX That might change if we start stuffing things into the rest - * of the monitor's virtual address space. - */ - { - l2_pgentry_t *l2e, new_l2e; - shadow_l3e_t *guest_l3e = NULL, *shadow_l3e; - int i; - -#if GUEST_PAGING_LEVELS == 2 - /* Shadow l3 tables were built by update_cr3 */ - if ( shadow2_mode_external(d) ) - shadow_l3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab; - else - BUG(); /* PV 2-on-3 is not supported yet */ - -#else /* GUEST_PAGING_LEVELS == 3 */ - - /* Use local vcpu's mappings if we can; otherwise make new mappings */ - if ( v == current ) - { - shadow_l3e = v->arch.shadow_vtable; - if ( !shadow2_mode_external(d) ) - guest_l3e = v->arch.guest_vtable; - } - else - { - mfn_t smfn; - int idx; - - /* Map the shadow l3 */ - smfn = pagetable_get_mfn(v->arch.shadow_table); - idx = shadow_l3_index(&smfn, guest_index(v->arch.shadow_vtable)); - shadow_l3e = sh2_map_domain_page(smfn); - shadow_l3e += idx; - if ( !shadow2_mode_external(d) ) - { - /* Also the guest l3 */ - mfn_t gmfn = pagetable_get_mfn(v->arch.guest_table); - guest_l3e = sh2_map_domain_page(gmfn); - guest_l3e += guest_index(v->arch.guest_vtable); - } - } -#endif /* GUEST_PAGING_LEVELS */ - - /* Choose where to write the entries, using linear maps if possible */ - if ( v == current && shadow2_mode_external(d) ) - { - /* From the monitor tables, it's safe to use linear maps to update - * monitor l2s */ - l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES); - } - else if ( shadow2_mode_external(d) ) - { - /* Map the monitor table's high l2 */ - l3_pgentry_t *l3e; - l3e = sh2_map_domain_page( - pagetable_get_mfn(v->arch.monitor_table)); - ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT); - l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[3]))); - sh2_unmap_domain_page(l3e); - } - else - { - /* Map the shadow table's high l2 */ - ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT); - l2e = sh2_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3])); - } - - - if ( !shadow2_mode_external(d) ) - { - 
/* Write linear mapping of guest. */ - for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) - { - new_l2e = (shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT) - ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])), - __PAGE_HYPERVISOR) - : l2e_empty(); - safe_write_entry( - &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], - &new_l2e); - } - } - - /* Write linear mapping of shadow. */ - for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) - { - new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT) - ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])), - __PAGE_HYPERVISOR) - : l2e_empty(); - safe_write_entry( - &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i], - &new_l2e); - } - - if ( v != current || !shadow2_mode_external(d) ) - sh2_unmap_domain_page(l2e); - -#if GUEST_PAGING_LEVELS == 3 - if ( v != current) - { - sh2_unmap_domain_page(shadow_l3e); - if ( !shadow2_mode_external(d) ) - sh2_unmap_domain_page(guest_l3e); - } -#endif - } - -#elif CONFIG_PAGING_LEVELS == 2 - - /* For PV, one l2e points at the guest l2, one points at the shadow - * l2. No maintenance required. - * For HVM, just need to update the l2e that points to the shadow l2. */ - - if ( shadow2_mode_external(d) ) - { - /* Use the linear map if we can; otherwise make a new mapping */ - if ( v == current ) - { - __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] = - l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), - __PAGE_HYPERVISOR); - } - else - { - l2_pgentry_t *ml2e; - ml2e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); - ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = - l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), - __PAGE_HYPERVISOR); - sh2_unmap_domain_page(ml2e); - } - } - -#else -#error this should not happen -#endif -} - - -// XXX -- should this be in this file? -// Or should it be moved to shadow2-common.c? 
-// -#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) -void sh2_pae_recopy(struct domain *d) -/* Called whenever we write to the l3 entries of a PAE pagetable which - * is currently in use. Each vcpu that is using the table needs to - * resync its copies of the l3s in linear maps and any low-memory - * copies it might have made for fitting into 32bit CR3. - * Since linear maps are also resynced when we change CR3, we don't - * need to worry about changes to PAE l3es that are not currently in use.*/ -{ - struct vcpu *v; - cpumask_t flush_mask = CPU_MASK_NONE; - ASSERT(shadow2_lock_is_acquired(d)); - - for_each_vcpu(d, v) - { - if ( !v->arch.shadow2.pae_flip_pending ) - continue; - - cpu_set(v->processor, flush_mask); - - SHADOW2_PRINTK("d=%u v=%u\n", v->domain->domain_id, v->vcpu_id); - - /* This vcpu has a copy in its linear maps */ - sh2_update_linear_entries(v); - if ( hvm_guest(v) ) - { - /* This vcpu has a copy in its HVM PAE l3 */ - v->arch.hvm_vcpu.hw_cr3 = - hvm_pae_copy_root(v, v->arch.shadow_vtable, - !shadow2_vcpu_mode_translate(v)); - } -#if CONFIG_PAGING_LEVELS == 3 - else - { - /* This vcpu might have copied the l3 to below 4GB */ - if ( v->arch.cr3 >> PAGE_SHIFT - != pagetable_get_pfn(v->arch.shadow_table) ) - { - /* Recopy to where that copy is. */ - int i; - l3_pgentry_t *dst, *src; - dst = __va(v->arch.cr3 & ~0x1f); /* Mask cache control bits */ - src = v->arch.shadow_vtable; - for ( i = 0 ; i < 4 ; i++ ) - safe_write_entry(dst + i, src + i); - } - } -#endif - v->arch.shadow2.pae_flip_pending = 0; - } - - flush_tlb_mask(flush_mask); -} -#endif /* (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) */ - - -/* removes: - * vcpu->arch.guest_vtable - * vcpu->arch.shadow_table - * vcpu->arch.shadow_vtable - * Does all appropriate management/bookkeeping/refcounting/etc... 
- */ -static void -sh2_detach_old_tables(struct vcpu *v) -{ - mfn_t smfn; - - //// - //// vcpu->arch.guest_vtable - //// - if ( (shadow2_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) && - v->arch.guest_vtable ) - { - // Q: why does this need to use (un)map_domain_page_*global* ? - sh2_unmap_domain_page_global(v->arch.guest_vtable); - v->arch.guest_vtable = NULL; - } - - //// - //// vcpu->arch.shadow_table - //// - smfn = pagetable_get_mfn(v->arch.shadow_table); - if ( mfn_x(smfn) ) - { - ASSERT(v->arch.shadow_vtable); - -#if GUEST_PAGING_LEVELS == 3 - // PAE guests do not (necessarily) use an entire page for their - // 4-entry L3s, so we have to deal with them specially. - // - sh2_put_ref_l3_subshadow(v, v->arch.shadow_vtable, smfn); -#else - sh2_put_ref(v, smfn, 0); -#endif - -#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3) - { - struct pae_l3_bookkeeping *info = - sl3p_to_info(v->arch.shadow_vtable); - ASSERT(test_bit(v->vcpu_id, &info->vcpus)); - clear_bit(v->vcpu_id, &info->vcpus); - } -#endif - v->arch.shadow_table = pagetable_null(); - } - - //// - //// vcpu->arch.shadow_vtable - //// - if ( (shadow2_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) && - v->arch.shadow_vtable ) - { - // Q: why does this need to use (un)map_domain_page_*global* ? - // - sh2_unmap_domain_page_global(v->arch.shadow_vtable); - v->arch.shadow_vtable = NULL; - } -} - -static void -sh2_update_cr3(struct vcpu *v) -/* Updates vcpu->arch.shadow_table after the guest has changed CR3. - * Paravirtual guests should set v->arch.guest_table (and guest_table_user, - * if appropriate). - * HVM guests should also set hvm_get_guest_cntl_reg(v, 3)... 
- */ -{ - struct domain *d = v->domain; - mfn_t gmfn, smfn; -#if GUEST_PAGING_LEVELS == 3 - u32 guest_idx=0; -#endif - - ASSERT(shadow2_lock_is_acquired(v->domain)); - ASSERT(v->arch.shadow2.mode); - - //// - //// vcpu->arch.guest_table is already set - //// - -#ifndef NDEBUG - /* Double-check that the HVM code has sent us a sane guest_table */ - if ( hvm_guest(v) ) - { - gfn_t gfn; - - ASSERT(shadow2_mode_external(d)); - - // Is paging enabled on this vcpu? - if ( shadow2_vcpu_mode_translate(v) ) - { - gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3))); - gmfn = vcpu_gfn_to_mfn(v, gfn); - ASSERT(valid_mfn(gmfn)); - ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn)); - } - else - { - /* Paging disabled: guest_table points at (part of) p2m */ -#if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */ - /* For everything else, they sould be the same */ - ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn); -#endif - } - } -#endif - - SHADOW2_PRINTK("d=%u v=%u guest_table=%05lx\n", - d->domain_id, v->vcpu_id, - (unsigned long)pagetable_get_pfn(v->arch.guest_table)); - -#if GUEST_PAGING_LEVELS == 4 - if ( !(v->arch.flags & TF_kernel_mode) ) - gmfn = pagetable_get_mfn(v->arch.guest_table_user); - else -#endif - gmfn = pagetable_get_mfn(v->arch.guest_table); - - sh2_detach_old_tables(v); - - if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) ) - { - ASSERT(v->arch.cr3 == 0); - return; - } - - //// - //// vcpu->arch.guest_vtable - //// - if ( shadow2_mode_external(d) ) - { -#if GUEST_PAGING_LEVELS == 3 - if ( shadow2_vcpu_mode_translate(v) ) - /* Paging enabled: find where in the page the l3 table is */ - guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3)); - else - /* Paging disabled: l3 is at the start of a page (in the p2m) */ - guest_idx = 0; - - // Ignore the low 2 bits of guest_idx -- they are really just - // cache control. - guest_idx &= ~3; - // XXX - why does this need a global map? 
- v->arch.guest_vtable = - (guest_l3e_t *)sh2_map_domain_page_global(gmfn) + guest_idx; -#else - // XXX - why does this need a global map? - v->arch.guest_vtable = sh2_map_domain_page_global(gmfn); -#endif - } - else - { -#ifdef __x86_64__ - v->arch.guest_vtable = __linear_l4_table; -#elif GUEST_PAGING_LEVELS == 3 - // XXX - why does this need a global map? - v->arch.guest_vtable = sh2_map_domain_page_global(gmfn); -#else - v->arch.guest_vtable = __linear_l2_table; -#endif - } - -#if 0 - printk("%s %s %d gmfn=%05lx guest_vtable=%p\n", - __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable); -#endif - - //// - //// vcpu->arch.shadow_table - //// - smfn = get_shadow_status(v, gmfn, PGC_SH2_guest_root_type); - if ( valid_mfn(smfn) ) - { - /* Pull this root shadow to the front of the list of roots. */ - list_del(&mfn_to_page(smfn)->list); - list_add(&mfn_to_page(smfn)->list, &d->arch.shadow2.toplevel_shadows); - } - else - { - /* This guest MFN is a pagetable. Must revoke write access. */ - if ( shadow2_remove_write_access(v, gmfn, GUEST_PAGING_LEVELS, 0) - != 0 ) - flush_tlb_mask(d->domain_dirty_cpumask); - /* Make sure there's enough free shadow memory. */ - shadow2_prealloc(d, SHADOW2_MAX_ORDER); - /* Shadow the page. 
*/ - smfn = sh2_make_shadow(v, gmfn, PGC_SH2_guest_root_type); - list_add(&mfn_to_page(smfn)->list, &d->arch.shadow2.toplevel_shadows); - } - ASSERT(valid_mfn(smfn)); - v->arch.shadow_table = pagetable_from_mfn(smfn); - -#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW - /* Once again OK to unhook entries from this table if we see fork/exit */ - ASSERT(sh2_mfn_is_a_page_table(gmfn)); - mfn_to_page(gmfn)->shadow2_flags &= ~SH2F_unhooked_mappings; -#endif - - - //// - //// vcpu->arch.shadow_vtable - //// - if ( shadow2_mode_external(d) ) - { -#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3) - mfn_t adjusted_smfn = smfn; - u32 shadow_idx = shadow_l3_index(&adjusted_smfn, guest_idx); - // Q: why does this need to use (un)map_domain_page_*global* ? - v->arch.shadow_vtable = - (shadow_l3e_t *)sh2_map_domain_page_global(adjusted_smfn) + - shadow_idx; -#else - // Q: why does this need to use (un)map_domain_page_*global* ? - v->arch.shadow_vtable = sh2_map_domain_page_global(smfn); -#endif - } - else - { -#if SHADOW_PAGING_LEVELS == 4 - v->arch.shadow_vtable = __sh2_linear_l4_table; -#elif GUEST_PAGING_LEVELS == 3 - // XXX - why does this need a global map? - v->arch.shadow_vtable = sh2_map_domain_page_global(smfn); -#else - v->arch.shadow_vtable = __sh2_linear_l2_table; -#endif - } - - //// - //// Take a ref to the new shadow table, and pin it. - //// - // - // This ref is logically "held" by v->arch.shadow_table entry itself. - // Release the old ref. - // -#if GUEST_PAGING_LEVELS == 3 - // PAE guests do not (necessarily) use an entire page for their - // 4-entry L3s, so we have to deal with them specially. - // - // XXX - might want to revisit this if/when we do multiple compilation for - // HVM-vs-PV guests, as PAE PV guests could get away without doing - // subshadows. 
- // - sh2_get_ref_l3_subshadow(v->arch.shadow_vtable, smfn); - sh2_pin_l3_subshadow(v->arch.shadow_vtable, smfn); -#else - sh2_get_ref(smfn, 0); - sh2_pin(smfn); -#endif - -#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3) - // PAE 3-on-3 shadows have to keep track of which vcpu's are using - // which l3 subshadow, in order handle the SHADOW2_SET_L3PAE_RECOPY - // case from validate_gl3e(). Search for SHADOW2_SET_L3PAE_RECOPY - // in the code for more info. - // - { - struct pae_l3_bookkeeping *info = - sl3p_to_info(v->arch.shadow_vtable); - ASSERT(!test_bit(v->vcpu_id, &info->vcpus)); - set_bit(v->vcpu_id, &info->vcpus); - } -#endif - - debugtrace_printk("%s cr3 gmfn=%05lx smfn=%05lx\n", - __func__, gmfn, smfn); - - /// - /// v->arch.cr3 and, if appropriate, v->arch.hvm_vcpu.hw_cr3 - /// - if ( shadow2_mode_external(d) ) - { - ASSERT(hvm_guest(v)); - make_cr3(v, pagetable_get_pfn(v->arch.monitor_table)); - -#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2) -#if SHADOW_PAGING_LEVELS != 3 -#error unexpected combination of GUEST and SHADOW paging levels -#endif - /* 2-on-3: make a PAE l3 table that points at the four-page l2 */ - { - mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table); - int i; - - ASSERT(v->arch.hvm_vcpu.hw_cr3 == - virt_to_maddr(v->arch.hvm_vcpu.hvm_lowmem_l3tab)); - for (i = 0; i < 4; i++) - { - v->arch.hvm_vcpu.hvm_lowmem_l3tab[i] = - shadow_l3e_from_mfn(_mfn(mfn_x(smfn)+i), _PAGE_PRESENT); - } - } -#elif (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) - /* 3-on-3: copy the shadow l3 to slots that are below 4GB. - * If paging is disabled, clear l3e reserved bits; otherwise - * remove entries that have reserved bits set. 
*/ - v->arch.hvm_vcpu.hw_cr3 = - hvm_pae_copy_root(v, v->arch.shadow_vtable, - !shadow2_vcpu_mode_translate(v)); -#else - /* 2-on-2 or 4-on-4: just put the shadow top-level into cr3 */ - v->arch.hvm_vcpu.hw_cr3 = - pagetable_get_paddr(v->arch.shadow_table); -#endif - } - else // not shadow2_mode_external... - { - /* We don't support PV except guest == shadow == config levels */ - BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS); - make_cr3(v, pagetable_get_pfn(v->arch.shadow_table)); - } - - /* Fix up the linear pagetable mappings */ - sh2_update_linear_entries(v); -} - - -/**************************************************************************/ -/* Functions to revoke guest rights */ - -#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC -static int sh2_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn) -/* Look up this vaddr in the current shadow and see if it's a writeable - * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */ -{ - shadow_l1e_t sl1e, *sl1p; - shadow_l2e_t *sl2p; -#if GUEST_PAGING_LEVELS >= 3 - shadow_l3e_t *sl3p; -#if GUEST_PAGING_LEVELS >= 4 - shadow_l4e_t *sl4p; -#endif -#endif - mfn_t sl1mfn; - - - /* Carefully look in the shadow linear map for the l1e we expect */ - if ( v->arch.shadow_vtable == NULL ) return 0; -#if GUEST_PAGING_LEVELS >= 4 - sl4p = sh2_linear_l4_table(v) + shadow_l4_linear_offset(vaddr); - if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) ) - return 0; - sl3p = sh2_linear_l3_table(v) + shadow_l3_linear_offset(vaddr); - if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) ) - return 0; -#elif GUEST_PAGING_LEVELS == 3 - sl3p = ((shadow_l3e_t *) v->arch.shadow_vtable) - + shadow_l3_linear_offset(vaddr); - if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) ) - return 0; -#endif - sl2p = sh2_linear_l2_table(v) + shadow_l2_linear_offset(vaddr); - if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) ) - return 0; - sl1p = sh2_linear_l1_table(v) + shadow_l1_linear_offset(vaddr); - sl1e = *sl1p; - 
if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW)) - != (_PAGE_PRESENT|_PAGE_RW)) - || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) ) - return 0; - - /* Found it! Need to remove its write permissions. */ - sl1mfn = shadow_l2e_get_mfn(*sl2p); - sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW); - shadow_set_l1e(v, sl1p, sl1e, sl1mfn); - return 1; -} -#endif - -int sh2_remove_write_access(struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn) -/* Excises all writeable mappings to readonly_mfn from this l1 shadow table */ -{ - shadow_l1e_t *sl1e; - int done = 0; - int flags; - - SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done, - { - flags = shadow_l1e_get_flags(*sl1e); - if ( (flags & _PAGE_PRESENT) - && (flags & _PAGE_RW) - && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) ) - { - shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn); - if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info - & PGT_count_mask) == 0 ) - /* This breaks us cleanly out of the FOREACH macro */ - done = 1; - } - }); - return done; -} - - -int sh2_remove_all_mappings(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn) -/* Excises all mappings to guest frame from this shadow l1 table */ -{ - shadow_l1e_t *sl1e; - int done = 0; - int flags; - - SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done, - { - flags = shadow_l1e_get_flags(*sl1e); - if ( (flags & _PAGE_PRESENT) - && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) ) - { - shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn); - if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 ) - /* This breaks us cleanly out of the FOREACH macro */ - done = 1; - } - }); - return done; -} - -/**************************************************************************/ -/* Functions to excise all pointers to shadows from higher-level shadows. 
*/ - -void sh2_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn) -/* Blank out a single shadow entry */ -{ - switch (mfn_to_page(smfn)->count_info & PGC_SH2_type_mask) - { - case PGC_SH2_l1_shadow: - shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break; - case PGC_SH2_l2_shadow: -#if GUEST_PAGING_LEVELS == 3 - case PGC_SH2_l2h_shadow: -#endif - shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break; -#if GUEST_PAGING_LEVELS >= 3 - case PGC_SH2_l3_shadow: - shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break; -#if GUEST_PAGING_LEVELS >= 4 - case PGC_SH2_l4_shadow: - shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break; -#endif -#endif - default: BUG(); /* Called with the wrong kind of shadow. */ - } -} - -int sh2_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn) -/* Remove all mappings of this l1 shadow from this l2 shadow */ -{ - shadow_l2e_t *sl2e; - int done = 0; - int flags; -#if GUEST_PAGING_LEVELS != 4 - int xen_mappings = !shadow2_mode_external(v->domain); -#endif - - SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, done, xen_mappings, - { - flags = shadow_l2e_get_flags(*sl2e); - if ( (flags & _PAGE_PRESENT) - && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) ) - { - shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn); - if ( (mfn_to_page(sl1mfn)->count_info & PGC_SH2_type_mask) == 0 ) - /* This breaks us cleanly out of the FOREACH macro */ - done = 1; - } - }); - return done; -} - -#if GUEST_PAGING_LEVELS >= 3 -int sh2_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn) -/* Remove all mappings of this l2 shadow from this l3 shadow */ -{ - shadow_l3e_t *sl3e; - int done = 0; - int flags; - - SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, done, - { - flags = shadow_l3e_get_flags(*sl3e); - if ( (flags & _PAGE_PRESENT) - && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) ) - { - shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn); - if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH2_type_mask) == 0 ) - /* This breaks us cleanly out of the 
FOREACH macro */ - done = 1; - } - }); - return done; -} - -#if GUEST_PAGING_LEVELS >= 4 -int sh2_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn) -/* Remove all mappings of this l3 shadow from this l4 shadow */ -{ - shadow_l4e_t *sl4e; - int done = 0; - int flags, xen_mappings = !shadow2_mode_external(v->domain); - - SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, done, xen_mappings, - { - flags = shadow_l4e_get_flags(*sl4e); - if ( (flags & _PAGE_PRESENT) - && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) ) - { - shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn); - if ( (mfn_to_page(sl3mfn)->count_info & PGC_SH2_type_mask) == 0 ) - /* This breaks us cleanly out of the FOREACH macro */ - done = 1; - } - }); - return done; -} -#endif /* 64bit guest */ -#endif /* PAE guest */ - -/**************************************************************************/ -/* Handling HVM guest writes to pagetables */ - -/* Check that the user is allowed to perform this write. - * Returns a mapped pointer to write to, and the mfn it's on, - * or NULL for error. */ -static inline void * emulate_map_dest(struct vcpu *v, - unsigned long vaddr, - struct x86_emulate_ctxt *ctxt, - mfn_t *mfnp) -{ - walk_t gw; - u32 flags; - gfn_t gfn; - mfn_t mfn; - - guest_walk_tables(v, vaddr, &gw, 1); - flags = accumulate_guest_flags(&gw); - gfn = guest_l1e_get_gfn(gw.eff_l1e); - mfn = vcpu_gfn_to_mfn(v, gfn); - sh2_audit_gw(v, &gw); - unmap_walk(v, &gw); - - if ( !(flags & _PAGE_PRESENT) - || !(flags & _PAGE_RW) - || (!(flags & _PAGE_USER) && ring_3(ctxt->regs)) ) - { - /* This write would have faulted even on bare metal */ - v->arch.shadow2.propagate_fault = 1; - return NULL; - } - - if ( !valid_mfn(mfn) ) - { - /* Attempted a write to a bad gfn. This should never happen: - * after all, we're here because this write is to a page table. 
*/ - BUG(); - } - - ASSERT(sh2_mfn_is_a_page_table(mfn)); - *mfnp = mfn; - return sh2_map_domain_page(mfn) + (vaddr & ~PAGE_MASK); -} - -int -sh2_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src, - u32 bytes, struct x86_emulate_ctxt *ctxt) -{ - ASSERT(shadow2_lock_is_acquired(v->domain)); - while ( bytes > 0 ) - { - mfn_t mfn; - int bytes_on_page; - void *addr; - - bytes_on_page = PAGE_SIZE - (vaddr & ~PAGE_MASK); - if ( bytes_on_page > bytes ) - bytes_on_page = bytes; - - if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL ) - return X86EMUL_PROPAGATE_FAULT; - memcpy(addr, src, bytes_on_page); - shadow2_validate_guest_pt_write(v, mfn, addr, bytes_on_page); - bytes -= bytes_on_page; - /* If we are writing zeros to this page, might want to unshadow */ - if ( *(u8 *)addr == 0 ) - check_for_early_unshadow(v, mfn); - sh2_unmap_domain_page(addr); - } - shadow2_audit_tables(v); - return X86EMUL_CONTINUE; -} - -int -sh2_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr, - unsigned long old, unsigned long new, - unsigned int bytes, struct x86_emulate_ctxt *ctxt) -{ - mfn_t mfn; - void *addr; - unsigned long prev; - int rv = X86EMUL_CONTINUE; - - ASSERT(shadow2_lock_is_acquired(v->domain)); - ASSERT(bytes <= sizeof (unsigned long)); - - if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL ) - return X86EMUL_PROPAGATE_FAULT; - - switch (bytes) - { - case 1: prev = cmpxchg(((u8 *)addr), old, new); break; - case 2: prev = cmpxchg(((u16 *)addr), old, new); break; - case 4: prev = cmpxchg(((u32 *)addr), old, new); break; - case 8: prev = cmpxchg(((u64 *)addr), old, new); break; - default: - SHADOW2_PRINTK("cmpxchg of size %i is not supported\n", bytes); - prev = ~old; - } - - if ( (prev == old) ) - shadow2_validate_guest_pt_write(v, mfn, addr, bytes); - else - rv = X86EMUL_CMPXCHG_FAILED; - - SHADOW2_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx" - " wanted %#lx now %#lx bytes %u\n", - vaddr, prev, old, new, *(unsigned long *)addr, bytes); 
- - /* If we are writing zeros to this page, might want to unshadow */ - if ( *(u8 *)addr == 0 ) - check_for_early_unshadow(v, mfn); - - sh2_unmap_domain_page(addr); - shadow2_audit_tables(v); - check_for_early_unshadow(v, mfn); - return rv; -} - -int -sh2_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr, - unsigned long old_lo, unsigned long old_hi, - unsigned long new_lo, unsigned long new_hi, - struct x86_emulate_ctxt *ctxt) -{ - mfn_t mfn; - void *addr; - u64 old, new, prev; - int rv = X86EMUL_CONTINUE; - - ASSERT(shadow2_lock_is_acquired(v->domain)); - - if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL ) - return X86EMUL_PROPAGATE_FAULT; - - old = (((u64) old_hi) << 32) | (u64) old_lo; - new = (((u64) new_hi) << 32) | (u64) new_lo; - prev = cmpxchg(((u64 *)addr), old, new); - - if ( (prev == old) ) - shadow2_validate_guest_pt_write(v, mfn, addr, 8); - else - rv = X86EMUL_CMPXCHG_FAILED; - - /* If we are writing zeros to this page, might want to unshadow */ - if ( *(u8 *)addr == 0 ) - check_for_early_unshadow(v, mfn); - - sh2_unmap_domain_page(addr); - shadow2_audit_tables(v); - check_for_early_unshadow(v, mfn); - return rv; -} - - -/**************************************************************************/ -/* Audit tools */ - -#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES - -#define AUDIT_FAIL(_level, _fmt, _a...) 
do { \ - printk("Shadow2 %u-on-%u audit failed at level %i, index %i\n" \ - "gl" #_level "mfn = %" SH2_PRI_mfn \ - " sl" #_level "mfn = %" SH2_PRI_mfn \ - " &gl" #_level "e = %p &sl" #_level "e = %p" \ - " gl" #_level "e = %" SH2_PRI_gpte \ - " sl" #_level "e = %" SH2_PRI_pte "\nError: " _fmt "\n", \ - GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \ - _level, guest_index(gl ## _level ## e), \ - mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \ - gl ## _level ## e, sl ## _level ## e, \ - gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \ - ##_a); \ - BUG(); \ - done = 1; \ -} while (0) - - -static char * sh2_audit_flags(struct vcpu *v, int level, - int gflags, int sflags) -/* Common code for auditing flag bits */ -{ - if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) ) - return "shadow is present but guest is not present"; - if ( (sflags & _PAGE_GLOBAL) && !hvm_guest(v) ) - return "global bit set in PV shadow"; - if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE))) - && ((sflags & _PAGE_DIRTY) && !(gflags & _PAGE_DIRTY)) ) - return "dirty bit not propagated"; - if ( level == 2 && (sflags & _PAGE_PSE) ) - return "PS bit set in shadow"; -#if SHADOW_PAGING_LEVELS == 3 - if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */ -#endif - if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) ) - return "user/supervisor bit does not match"; - if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) ) - return "NX bit does not match"; - if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) ) - return "shadow grants write access but guest does not"; - if ( (sflags & _PAGE_ACCESSED) && !(gflags & _PAGE_ACCESSED) ) - return "accessed bit not propagated"; - return NULL; -} - -static inline mfn_t -audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn) -/* Convert this gfn to an mfn in the manner appropriate for the - * guest pagetable it's used in (gmfn) */ -{ - if ( !shadow2_mode_translate(v->domain) ) - return _mfn(gfn_x(gfn)); - - 
if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask) - != PGT_writable_page ) - return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */ - else - return sh2_gfn_to_mfn(v->domain, gfn_x(gfn)); -} - - -int sh2_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x) -{ - guest_l1e_t *gl1e, *gp; - shadow_l1e_t *sl1e; - mfn_t mfn, gmfn, gl1mfn; - gfn_t gfn; - char *s; - int done = 0; - - /* Follow the backpointer */ - gl1mfn = _mfn(mfn_to_page(sl1mfn)->u.inuse.type_info); - gl1e = gp = sh2_map_domain_page(gl1mfn); - SHADOW2_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, { - - s = sh2_audit_flags(v, 1, guest_l1e_get_flags(*gl1e), - shadow_l1e_get_flags(*sl1e)); - if ( s ) AUDIT_FAIL(1, "%s", s); - - if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS ) - { - gfn = guest_l1e_get_gfn(*gl1e); - mfn = shadow_l1e_get_mfn(*sl1e); - gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn); - if ( mfn_x(gmfn) != mfn_x(mfn) ) - AUDIT_FAIL(1, "bad translation: gfn %" SH2_PRI_gfn - " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n", - gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn)); - } - }); - sh2_unmap_domain_page(gp); - return done; -} - -int sh2_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x) -{ - guest_l1e_t *gl1e, e; - shadow_l1e_t *sl1e; - mfn_t gl1mfn = _mfn(INVALID_MFN); - int f; - int done = 0; - - /* fl1 has no useful backpointer: all we can check are flags */ - e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */ - SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done, { - f = shadow_l1e_get_flags(*sl1e); - f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2); - if ( !(f == 0 - || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| - _PAGE_ACCESSED|_PAGE_DIRTY) - || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)) ) - AUDIT_FAIL(1, "fl1e has bad flags"); - }); - return 0; -} - -int sh2_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x) -{ - guest_l2e_t *gl2e, *gp; - shadow_l2e_t *sl2e; - mfn_t mfn, gmfn, gl2mfn; - gfn_t gfn; - char *s; - int done = 0; -#if GUEST_PAGING_LEVELS != 4 - int 
xen_mappings = !shadow2_mode_external(v->domain); -#endif - - /* Follow the backpointer */ - gl2mfn = _mfn(mfn_to_page(sl2mfn)->u.inuse.type_info); - gl2e = gp = sh2_map_domain_page(gl2mfn); - SHADOW2_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, xen_mappings, { - - s = sh2_audit_flags(v, 2, guest_l2e_get_flags(*gl2e), - shadow_l2e_get_flags(*sl2e)); - if ( s ) AUDIT_FAIL(2, "%s", s); - - if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS ) - { - gfn = guest_l2e_get_gfn(*gl2e); - mfn = shadow_l2e_get_mfn(*sl2e); - gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) - ? get_fl1_shadow_status(v, gfn) - : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn), - PGC_SH2_l1_shadow); - if ( mfn_x(gmfn) != mfn_x(mfn) ) - AUDIT_FAIL(2, "bad translation: gfn %" SH2_PRI_gfn - " (--> %" SH2_PRI_mfn ")" - " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n", - gfn_x(gfn), - (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0 - : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)), - mfn_x(gmfn), mfn_x(mfn)); - } - }); - sh2_unmap_domain_page(gp); - return 0; -} - -#if GUEST_PAGING_LEVELS >= 3 -int sh2_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x) -{ - guest_l3e_t *gl3e, *gp; - shadow_l3e_t *sl3e; - mfn_t mfn, gmfn, gl3mfn; - gfn_t gfn; - char *s; - int done = 0; - - /* Follow the backpointer */ - gl3mfn = _mfn(mfn_to_page(sl3mfn)->u.inuse.type_info); - gl3e = gp = sh2_map_domain_page(gl3mfn); - SHADOW2_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, { - - s = sh2_audit_flags(v, 3, guest_l3e_get_flags(*gl3e), - shadow_l3e_get_flags(*sl3e)); - if ( s ) AUDIT_FAIL(3, "%s", s); - - if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS ) - { - gfn = guest_l3e_get_gfn(*gl3e); - mfn = shadow_l3e_get_mfn(*sl3e); - gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn), - (GUEST_PAGING_LEVELS == 3 - && !shadow2_mode_external(v->domain) - && (guest_index(gl3e) % 4) == 3) - ? 
PGC_SH2_l2h_pae_shadow - : PGC_SH2_l2_shadow); - if ( mfn_x(gmfn) != mfn_x(mfn) ) - AUDIT_FAIL(3, "bad translation: gfn %" SH2_PRI_gfn - " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n", - gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn)); - } - }); - sh2_unmap_domain_page(gp); - return 0; -} -#endif /* GUEST_PAGING_LEVELS >= 3 */ - -#if GUEST_PAGING_LEVELS >= 4 -int sh2_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x) -{ - guest_l4e_t *gl4e, *gp; - shadow_l4e_t *sl4e; - mfn_t mfn, gmfn, gl4mfn; - gfn_t gfn; - char *s; - int done = 0; - int xen_mappings = !shadow2_mode_external(v->domain); - - /* Follow the backpointer */ - gl4mfn = _mfn(mfn_to_page(sl4mfn)->u.inuse.type_info); - gl4e = gp = sh2_map_domain_page(gl4mfn); - SHADOW2_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, xen_mappings, - { - s = sh2_audit_flags(v, 4, guest_l4e_get_flags(*gl4e), - shadow_l4e_get_flags(*sl4e)); - if ( s ) AUDIT_FAIL(4, "%s", s); - - if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS ) - { - gfn = guest_l4e_get_gfn(*gl4e); - mfn = shadow_l4e_get_mfn(*sl4e); - gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn), - PGC_SH2_l3_shadow); - if ( mfn_x(gmfn) != mfn_x(mfn) ) - AUDIT_FAIL(4, "bad translation: gfn %" SH2_PRI_gfn - " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n", - gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn)); - } - }); - sh2_unmap_domain_page(gp); - return 0; -} -#endif /* GUEST_PAGING_LEVELS >= 4 */ - - -#undef AUDIT_FAIL - -#endif /* Audit code */ - -/**************************************************************************/ -/* Entry points into this mode of the shadow code. - * This will all be mangled by the preprocessor to uniquify everything. 
*/ -struct shadow2_paging_mode sh2_paging_mode = { - .page_fault = sh2_page_fault, - .invlpg = sh2_invlpg, - .gva_to_gpa = sh2_gva_to_gpa, - .gva_to_gfn = sh2_gva_to_gfn, - .update_cr3 = sh2_update_cr3, - .map_and_validate_gl1e = sh2_map_and_validate_gl1e, - .map_and_validate_gl2e = sh2_map_and_validate_gl2e, - .map_and_validate_gl2he = sh2_map_and_validate_gl2he, - .map_and_validate_gl3e = sh2_map_and_validate_gl3e, - .map_and_validate_gl4e = sh2_map_and_validate_gl4e, - .detach_old_tables = sh2_detach_old_tables, - .x86_emulate_write = sh2_x86_emulate_write, - .x86_emulate_cmpxchg = sh2_x86_emulate_cmpxchg, - .x86_emulate_cmpxchg8b = sh2_x86_emulate_cmpxchg8b, - .make_monitor_table = sh2_make_monitor_table, - .destroy_monitor_table = sh2_destroy_monitor_table, -#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC - .guess_wrmap = sh2_guess_wrmap, -#endif - .guest_levels = GUEST_PAGING_LEVELS, - .shadow_levels = SHADOW_PAGING_LEVELS, -}; - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index f47a682297..7d188ceef3 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -870,8 +870,8 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs) if ( unlikely(IN_HYPERVISOR_RANGE(addr)) ) { - if ( shadow2_mode_external(d) && guest_mode(regs) ) - return shadow2_fault(addr, regs); + if ( shadow_mode_external(d) && guest_mode(regs) ) + return shadow_fault(addr, regs); if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) ) return handle_gdt_ldt_mapping_fault( addr - GDT_LDT_VIRT_START, regs); @@ -890,8 +890,8 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs) ptwr_do_page_fault(d, addr, regs) ) return EXCRET_fault_fixed; - if ( shadow2_mode_enabled(d) ) - return shadow2_fault(addr, regs); + if ( shadow_mode_enabled(d) ) + return shadow_fault(addr, regs); return 0; } diff --git 
a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index cd66f2a0c4..f1b7c7cc7b 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -59,10 +59,10 @@ extern void hypercall_page_initialise(struct domain *d, void *); struct shadow_domain { u32 mode; /* flags to control shadow operation */ - spinlock_t lock; /* shadow2 domain lock */ + spinlock_t lock; /* shadow domain lock */ int locker; /* processor which holds the lock */ const char *locker_function; /* Func that took it */ - struct list_head freelists[SHADOW2_MAX_ORDER + 1]; + struct list_head freelists[SHADOW_MAX_ORDER + 1]; struct list_head p2m_freelist; struct list_head p2m_inuse; struct list_head toplevel_shadows; @@ -70,10 +70,10 @@ struct shadow_domain { unsigned int free_pages; /* number of pages on freelists */ unsigned int p2m_pages; /* number of pages in p2m map */ - /* Shadow2 hashtable */ - struct shadow2_hash_entry *hash_table; - struct shadow2_hash_entry *hash_freelist; - struct shadow2_hash_entry *hash_allocations; + /* Shadow hashtable */ + struct shadow_hash_entry *hash_table; + struct shadow_hash_entry *hash_freelist; + struct shadow_hash_entry *hash_allocations; int hash_walking; /* Some function is walking the hash table */ /* Shadow log-dirty bitmap */ @@ -107,7 +107,7 @@ struct arch_domain /* Shadow-translated guest: Pseudophys base address of reserved area. */ unsigned long first_reserved_pfn; - struct shadow_domain shadow2; + struct shadow_domain shadow; /* Shadow translated domain: P2M mapping */ pagetable_t phys_table; @@ -135,7 +135,7 @@ struct pae_l3_cache { }; struct shadow_vcpu { /* Pointers to mode-specific entry points. */ - struct shadow2_paging_mode *mode; + struct shadow_paging_mode *mode; /* Last MFN that we emulated a write to. */ unsigned long last_emulated_mfn; /* HVM guest: paging enabled (CR0.PG)? */ @@ -201,7 +201,7 @@ struct arch_vcpu /* Current LDT details. 
*/ unsigned long shadow_ldt_mapcnt; - struct shadow_vcpu shadow2; + struct shadow_vcpu shadow; } __cacheline_aligned; /* shorthands to improve code legibility */ diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 106e486013..2acdd2f23d 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -22,7 +22,7 @@ struct page_info /* Each frame can be threaded onto a doubly-linked list. */ union { struct list_head list; - /* Shadow2 uses this field as an up-pointer in lower-level shadows */ + /* Shadow uses this field as an up-pointer in lower-level shadows */ paddr_t up; }; @@ -59,7 +59,7 @@ struct page_info /* Only used on guest pages with a shadow. * Guest pages with a shadow must have a non-zero type count, so this * does not conflict with the tlbflush timestamp. */ - u32 shadow2_flags; + u32 shadow_flags; // XXX -- we expect to add another field here, to be used for min/max // purposes, which is only used for shadow pages. @@ -76,7 +76,7 @@ struct page_info #define PGT_ldt_page (6U<<29) /* using this page in an LDT? */ #define PGT_writable_page (7U<<29) /* has writable mappings of this page? */ -#ifndef SHADOW2 +#ifndef SHADOW #define PGT_l1_shadow PGT_l1_page_table #define PGT_l2_shadow PGT_l2_page_table #define PGT_l3_shadow PGT_l3_page_table @@ -117,7 +117,7 @@ struct page_info /* 16-bit count of uses of this frame as its current type. 
*/ #define PGT_count_mask ((1U<<16)-1) -#ifndef SHADOW2 +#ifndef SHADOW #ifdef __x86_64__ #define PGT_high_mfn_shift 52 #define PGT_high_mfn_mask (0xfffUL << PGT_high_mfn_shift) @@ -132,7 +132,7 @@ struct page_info #define PGT_score_shift 23 #define PGT_score_mask (((1U<<4)-1)<u.inuse._domain)) @@ -227,7 +227,7 @@ extern void invalidate_shadow_ldt(struct vcpu *d); extern int shadow_remove_all_write_access( struct domain *d, unsigned long gmfn, unsigned long mfn); extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn); -extern int _shadow2_mode_refcounts(struct domain *d); +extern int _shadow_mode_refcounts(struct domain *d); static inline void put_page(struct page_info *page) { @@ -259,7 +259,7 @@ static inline int get_page(struct page_info *page, unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */ unlikely(d != _domain) ) /* Wrong owner? */ { - if ( !_shadow2_mode_refcounts(domain) ) + if ( !_shadow_mode_refcounts(domain) ) DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%" PRtype_info "\n", page_to_mfn(page), domain, unpickle_domptr(d), @@ -345,11 +345,11 @@ int check_descriptor(struct desc_struct *d); #define mfn_to_gmfn(_d, mfn) \ - ( (shadow2_mode_translate(_d)) \ + ( (shadow_mode_translate(_d)) \ ? 
get_gpfn_from_mfn(mfn) \ : (mfn) ) -#define gmfn_to_mfn(_d, gpfn) mfn_x(sh2_gfn_to_mfn(_d, gpfn)) +#define gmfn_to_mfn(_d, gpfn) mfn_x(sh_gfn_to_mfn(_d, gpfn)) /* diff --git a/xen/include/asm-x86/page-guest32.h b/xen/include/asm-x86/page-guest32.h deleted file mode 100644 index e93206169a..0000000000 --- a/xen/include/asm-x86/page-guest32.h +++ /dev/null @@ -1,105 +0,0 @@ - -#ifndef __X86_PAGE_GUEST_H__ -#define __X86_PAGE_GUEST_H__ - -#ifndef __ASSEMBLY__ -# include -#endif - -#define PAGETABLE_ORDER_32 10 -#define L1_PAGETABLE_ENTRIES_32 (1<> L1_PAGETABLE_SHIFT_32) & (L1_PAGETABLE_ENTRIES_32 - 1)) -#define l2_table_offset_32(a) \ - (((a) >> L2_PAGETABLE_SHIFT_32) & (L2_PAGETABLE_ENTRIES_32 - 1)) - -#define linear_l1_table_32 \ - ((l1_pgentry_32_t *)(LINEAR_PT_VIRT_START)) - -#define linear_pg_table_32 linear_l1_table_32 - -#endif /* __X86_PAGE_GUEST_H__ */ - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/include/asm-x86/perfc_defn.h b/xen/include/asm-x86/perfc_defn.h index ae6e3d2b9b..73b9ffe9ba 100644 --- a/xen/include/asm-x86/perfc_defn.h +++ b/xen/include/asm-x86/perfc_defn.h @@ -30,59 +30,59 @@ PERFCOUNTER_CPU(ptwr_emulations, "writable pt emulations") PERFCOUNTER_CPU(exception_fixed, "pre-exception fixed") -/* Shadow2 counters */ -PERFCOUNTER_CPU(shadow2_alloc, "calls to shadow2_alloc") -PERFCOUNTER_CPU(shadow2_alloc_tlbflush, "shadow2_alloc flushed TLBs") +/* Shadow counters */ +PERFCOUNTER_CPU(shadow_alloc, "calls to shadow_alloc") +PERFCOUNTER_CPU(shadow_alloc_tlbflush, "shadow_alloc flushed TLBs") /* STATUS counters do not reset when 'P' is hit */ -PERFSTATUS(shadow2_alloc_count, "number of shadow pages in use") -PERFCOUNTER_CPU(shadow2_free, "calls to shadow2_free") -PERFCOUNTER_CPU(shadow2_prealloc_1, "shadow2 recycles old shadows") -PERFCOUNTER_CPU(shadow2_prealloc_2, "shadow2 recycles in-use shadows") 
-PERFCOUNTER_CPU(shadow2_linear_map_failed, "shadow2 hit read-only linear map") -PERFCOUNTER_CPU(shadow2_a_update, "shadow2 A bit update") -PERFCOUNTER_CPU(shadow2_ad_update, "shadow2 A&D bit update") -PERFCOUNTER_CPU(shadow2_fault, "calls to shadow2_fault") -PERFCOUNTER_CPU(shadow2_fault_bail_bad_gfn, "shadow2_fault guest bad gfn") -PERFCOUNTER_CPU(shadow2_fault_bail_not_present, - "shadow2_fault guest not-present") -PERFCOUNTER_CPU(shadow2_fault_bail_nx, "shadow2_fault guest NX fault") -PERFCOUNTER_CPU(shadow2_fault_bail_ro_mapping, "shadow2_fault guest R/W fault") -PERFCOUNTER_CPU(shadow2_fault_bail_user_supervisor, - "shadow2_fault guest U/S fault") -PERFCOUNTER_CPU(shadow2_fault_emulate_read, "shadow2_fault emulates a read") -PERFCOUNTER_CPU(shadow2_fault_emulate_write, "shadow2_fault emulates a write") -PERFCOUNTER_CPU(shadow2_fault_emulate_failed, "shadow2_fault emulator fails") -PERFCOUNTER_CPU(shadow2_fault_mmio, "shadow2_fault handled as mmio") -PERFCOUNTER_CPU(shadow2_fault_fixed, "shadow2_fault fixed fault") -PERFCOUNTER_CPU(shadow2_ptwr_emulate, "shadow2 causes ptwr to emulate") -PERFCOUNTER_CPU(shadow2_validate_gl1e_calls, "calls to shadow2_validate_gl1e") -PERFCOUNTER_CPU(shadow2_validate_gl2e_calls, "calls to shadow2_validate_gl2e") -PERFCOUNTER_CPU(shadow2_validate_gl3e_calls, "calls to shadow2_validate_gl3e") -PERFCOUNTER_CPU(shadow2_validate_gl4e_calls, "calls to shadow2_validate_gl4e") -PERFCOUNTER_CPU(shadow2_hash_lookups, "calls to shadow2_hash_lookup") -PERFCOUNTER_CPU(shadow2_hash_lookup_head, "shadow2 hash hit in bucket head") -PERFCOUNTER_CPU(shadow2_hash_lookup_miss, "shadow2 hash misses") -PERFCOUNTER_CPU(shadow2_get_shadow_status, "calls to get_shadow_status") -PERFCOUNTER_CPU(shadow2_hash_inserts, "calls to shadow2_hash_insert") -PERFCOUNTER_CPU(shadow2_hash_deletes, "calls to shadow2_hash_delete") -PERFCOUNTER_CPU(shadow2_writeable, "shadow2 removes write access") -PERFCOUNTER_CPU(shadow2_writeable_h_1, "shadow2 writeable: 32b w2k3") 
-PERFCOUNTER_CPU(shadow2_writeable_h_2, "shadow2 writeable: 32pae w2k3") -PERFCOUNTER_CPU(shadow2_writeable_h_3, "shadow2 writeable: 64b w2k3") -PERFCOUNTER_CPU(shadow2_writeable_h_4, "shadow2 writeable: 32b linux low") -PERFCOUNTER_CPU(shadow2_writeable_bf, "shadow2 writeable brute-force") -PERFCOUNTER_CPU(shadow2_mappings, "shadow2 removes all mappings") -PERFCOUNTER_CPU(shadow2_mappings_bf, "shadow2 rm-mappings brute-force") -PERFCOUNTER_CPU(shadow2_early_unshadow, "shadow2 unshadows for fork/exit") -PERFCOUNTER_CPU(shadow2_early_unshadow_top, "shadow2 unhooks for fork/exit") -PERFCOUNTER_CPU(shadow2_unshadow, "shadow2 unshadows a page") -PERFCOUNTER_CPU(shadow2_up_pointer, "shadow2 unshadow by up-pointer") -PERFCOUNTER_CPU(shadow2_unshadow_bf, "shadow2 unshadow brute-force") -PERFCOUNTER_CPU(shadow2_get_page_fail, "shadow2_get_page_from_l1e failed") -PERFCOUNTER_CPU(shadow2_guest_walk, "shadow2 walks guest tables") -PERFCOUNTER_CPU(shadow2_walk_cache_hit, "shadow2 walk-cache hits") -PERFCOUNTER_CPU(shadow2_walk_cache_miss, "shadow2 walk-cache misses") +PERFSTATUS(shadow_alloc_count, "number of shadow pages in use") +PERFCOUNTER_CPU(shadow_free, "calls to shadow_free") +PERFCOUNTER_CPU(shadow_prealloc_1, "shadow recycles old shadows") +PERFCOUNTER_CPU(shadow_prealloc_2, "shadow recycles in-use shadows") +PERFCOUNTER_CPU(shadow_linear_map_failed, "shadow hit read-only linear map") +PERFCOUNTER_CPU(shadow_a_update, "shadow A bit update") +PERFCOUNTER_CPU(shadow_ad_update, "shadow A&D bit update") +PERFCOUNTER_CPU(shadow_fault, "calls to shadow_fault") +PERFCOUNTER_CPU(shadow_fault_bail_bad_gfn, "shadow_fault guest bad gfn") +PERFCOUNTER_CPU(shadow_fault_bail_not_present, + "shadow_fault guest not-present") +PERFCOUNTER_CPU(shadow_fault_bail_nx, "shadow_fault guest NX fault") +PERFCOUNTER_CPU(shadow_fault_bail_ro_mapping, "shadow_fault guest R/W fault") +PERFCOUNTER_CPU(shadow_fault_bail_user_supervisor, + "shadow_fault guest U/S fault") 
+PERFCOUNTER_CPU(shadow_fault_emulate_read, "shadow_fault emulates a read") +PERFCOUNTER_CPU(shadow_fault_emulate_write, "shadow_fault emulates a write") +PERFCOUNTER_CPU(shadow_fault_emulate_failed, "shadow_fault emulator fails") +PERFCOUNTER_CPU(shadow_fault_mmio, "shadow_fault handled as mmio") +PERFCOUNTER_CPU(shadow_fault_fixed, "shadow_fault fixed fault") +PERFCOUNTER_CPU(shadow_ptwr_emulate, "shadow causes ptwr to emulate") +PERFCOUNTER_CPU(shadow_validate_gl1e_calls, "calls to shadow_validate_gl1e") +PERFCOUNTER_CPU(shadow_validate_gl2e_calls, "calls to shadow_validate_gl2e") +PERFCOUNTER_CPU(shadow_validate_gl3e_calls, "calls to shadow_validate_gl3e") +PERFCOUNTER_CPU(shadow_validate_gl4e_calls, "calls to shadow_validate_gl4e") +PERFCOUNTER_CPU(shadow_hash_lookups, "calls to shadow_hash_lookup") +PERFCOUNTER_CPU(shadow_hash_lookup_head, "shadow hash hit in bucket head") +PERFCOUNTER_CPU(shadow_hash_lookup_miss, "shadow hash misses") +PERFCOUNTER_CPU(shadow_get_shadow_status, "calls to get_shadow_status") +PERFCOUNTER_CPU(shadow_hash_inserts, "calls to shadow_hash_insert") +PERFCOUNTER_CPU(shadow_hash_deletes, "calls to shadow_hash_delete") +PERFCOUNTER_CPU(shadow_writeable, "shadow removes write access") +PERFCOUNTER_CPU(shadow_writeable_h_1, "shadow writeable: 32b w2k3") +PERFCOUNTER_CPU(shadow_writeable_h_2, "shadow writeable: 32pae w2k3") +PERFCOUNTER_CPU(shadow_writeable_h_3, "shadow writeable: 64b w2k3") +PERFCOUNTER_CPU(shadow_writeable_h_4, "shadow writeable: 32b linux low") +PERFCOUNTER_CPU(shadow_writeable_bf, "shadow writeable brute-force") +PERFCOUNTER_CPU(shadow_mappings, "shadow removes all mappings") +PERFCOUNTER_CPU(shadow_mappings_bf, "shadow rm-mappings brute-force") +PERFCOUNTER_CPU(shadow_early_unshadow, "shadow unshadows for fork/exit") +PERFCOUNTER_CPU(shadow_early_unshadow_top, "shadow unhooks for fork/exit") +PERFCOUNTER_CPU(shadow_unshadow, "shadow unshadows a page") +PERFCOUNTER_CPU(shadow_up_pointer, "shadow unshadow by 
up-pointer") +PERFCOUNTER_CPU(shadow_unshadow_bf, "shadow unshadow brute-force") +PERFCOUNTER_CPU(shadow_get_page_fail, "shadow_get_page_from_l1e failed") +PERFCOUNTER_CPU(shadow_guest_walk, "shadow walks guest tables") +PERFCOUNTER_CPU(shadow_walk_cache_hit, "shadow walk-cache hits") +PERFCOUNTER_CPU(shadow_walk_cache_miss, "shadow walk-cache misses") /*#endif*/ /* __XEN_PERFC_DEFN_H__ */ diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h index efade3021c..f15559ba0a 100644 --- a/xen/include/asm-x86/shadow.h +++ b/xen/include/asm-x86/shadow.h @@ -1,7 +1,9 @@ /****************************************************************************** * include/asm-x86/shadow.h * - * Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,26 +23,608 @@ #ifndef _XEN_SHADOW_H #define _XEN_SHADOW_H -/* This file is just a wrapper around the new Shadow2 header, - * providing names that must be defined in any shadow implementation. 
*/ - -#include +#include +#include +#include +#include /* How to make sure a page is not referred to in a shadow PT */ /* This will need to be a for_each_vcpu if we go to per-vcpu shadows */ #define shadow_drop_references(_d, _p) \ - shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p))) + shadow_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p))) #define shadow_sync_and_drop_references(_d, _p) \ - shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p))) - -/* Whether we are translating the domain's frame numbers for it */ -#define shadow_mode_translate(d) shadow2_mode_translate(d) + shadow_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p))) -/* ...and if so, how to add and remove entries in the mapping */ +/* How to add and remove entries in the p2m mapping. */ #define guest_physmap_add_page(_d, _p, _m) \ - shadow2_guest_physmap_add_page((_d), (_p), (_m)) + shadow_guest_physmap_add_page((_d), (_p), (_m)) #define guest_physmap_remove_page(_d, _p, _m ) \ - shadow2_guest_physmap_remove_page((_d), (_p), (_m)) + shadow_guest_physmap_remove_page((_d), (_p), (_m)) + +/* Shadow PT operation mode : shadow-mode variable in arch_domain. 
*/ + +#define SHM2_shift 10 +/* We're in one of the shadow modes */ +#define SHM2_enable (1U << SHM2_shift) +/* Refcounts based on shadow tables instead of guest tables */ +#define SHM2_refcounts (XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT << SHM2_shift) +/* Enable log dirty mode */ +#define SHM2_log_dirty (XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY << SHM2_shift) +/* Xen does p2m translation, not guest */ +#define SHM2_translate (XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE << SHM2_shift) +/* Xen does not steal address space from the domain for its own booking; + * requires VT or similar mechanisms */ +#define SHM2_external (XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL << SHM2_shift) + +#define shadow_mode_enabled(_d) ((_d)->arch.shadow.mode) +#define shadow_mode_refcounts(_d) ((_d)->arch.shadow.mode & SHM2_refcounts) +#define shadow_mode_log_dirty(_d) ((_d)->arch.shadow.mode & SHM2_log_dirty) +#define shadow_mode_translate(_d) ((_d)->arch.shadow.mode & SHM2_translate) +#define shadow_mode_external(_d) ((_d)->arch.shadow.mode & SHM2_external) + +/* Xen traps & emulates all reads of all page table pages: + *not yet supported + */ +#define shadow_mode_trap_reads(_d) ({ (void)(_d); 0; }) + +// flags used in the return value of the shadow_set_lXe() functions... +#define SHADOW_SET_CHANGED 0x1 +#define SHADOW_SET_FLUSH 0x2 +#define SHADOW_SET_ERROR 0x4 +#define SHADOW_SET_L3PAE_RECOPY 0x8 + +// How do we tell that we have a 32-bit PV guest in a 64-bit Xen? +#ifdef __x86_64__ +#define pv_32bit_guest(_v) 0 // not yet supported +#else +#define pv_32bit_guest(_v) !hvm_guest(v) +#endif + +/* The shadow lock. + * + * This lock is per-domain. It is intended to allow us to make atomic + * updates to the software TLB that the shadow tables provide. 
+ * + * Specifically, it protects: + * - all changes to shadow page table pages + * - the shadow hash table + * - the shadow page allocator + * - all changes to guest page table pages; if/when the notion of + * out-of-sync pages is added to this code, then the shadow lock is + * protecting all guest page table pages which are not listed as + * currently as both guest-writable and out-of-sync... + * XXX -- need to think about this relative to writable page tables. + * - all changes to the page_info->tlbflush_timestamp + * - the page_info->count fields on shadow pages + * - the shadow dirty bit array and count + * - XXX + */ +#ifndef CONFIG_SMP +#error shadow.h currently requires CONFIG_SMP +#endif + +#define shadow_lock_init(_d) \ + do { \ + spin_lock_init(&(_d)->arch.shadow.lock); \ + (_d)->arch.shadow.locker = -1; \ + (_d)->arch.shadow.locker_function = "nobody"; \ + } while (0) + +#define shadow_lock_is_acquired(_d) \ + (current->processor == (_d)->arch.shadow.locker) + +#define shadow_lock(_d) \ + do { \ + if ( unlikely((_d)->arch.shadow.locker == current->processor) ) \ + { \ + printk("Error: shadow lock held by %s\n", \ + (_d)->arch.shadow.locker_function); \ + BUG(); \ + } \ + spin_lock(&(_d)->arch.shadow.lock); \ + ASSERT((_d)->arch.shadow.locker == -1); \ + (_d)->arch.shadow.locker = current->processor; \ + (_d)->arch.shadow.locker_function = __func__; \ + } while (0) + +#define shadow_unlock(_d) \ + do { \ + ASSERT((_d)->arch.shadow.locker == current->processor); \ + (_d)->arch.shadow.locker = -1; \ + (_d)->arch.shadow.locker_function = "nobody"; \ + spin_unlock(&(_d)->arch.shadow.lock); \ + } while (0) + +/* + * Levels of self-test and paranoia + * XXX should go in config files somewhere? 
+ */ +#define SHADOW_AUDIT_HASH 0x01 /* Check current hash bucket */ +#define SHADOW_AUDIT_HASH_FULL 0x02 /* Check every hash bucket */ +#define SHADOW_AUDIT_ENTRIES 0x04 /* Check this walk's shadows */ +#define SHADOW_AUDIT_ENTRIES_FULL 0x08 /* Check every shadow */ +#define SHADOW_AUDIT_ENTRIES_MFNS 0x10 /* Check gfn-mfn map in shadows */ +#define SHADOW_AUDIT_P2M 0x20 /* Check the p2m table */ + +#ifdef NDEBUG +#define SHADOW_AUDIT 0 +#define SHADOW_AUDIT_ENABLE 0 +#else +#define SHADOW_AUDIT 0x15 /* Basic audit of all except p2m. */ +#define SHADOW_AUDIT_ENABLE shadow_audit_enable +extern int shadow_audit_enable; +#endif + +/* + * Levels of optimization + * XXX should go in config files somewhere? + */ +#define SHOPT_WRITABLE_HEURISTIC 0x01 /* Guess at RW PTEs via linear maps */ +#define SHOPT_EARLY_UNSHADOW 0x02 /* Unshadow l1s on fork or exit */ + +#define SHADOW_OPTIMIZATIONS 0x03 + + +/* With shadow pagetables, the different kinds of address start + * to get get confusing. + * + * Virtual addresses are what they usually are: the addresses that are used + * to accessing memory while the guest is running. The MMU translates from + * virtual addresses to machine addresses. + * + * (Pseudo-)physical addresses are the abstraction of physical memory the + * guest uses for allocation and so forth. For the purposes of this code, + * we can largely ignore them. + * + * Guest frame numbers (gfns) are the entries that the guest puts in its + * pagetables. For normal paravirtual guests, they are actual frame numbers, + * with the translation done by the guest. + * + * Machine frame numbers (mfns) are the entries that the hypervisor puts + * in the shadow page tables. + * + * Elsewhere in the xen code base, the name "gmfn" is generally used to refer + * to a "machine frame number, from the guest's perspective", or in other + * words, pseudo-physical frame numbers. 
However, in the shadow code, the + * term "gmfn" means "the mfn of a guest page"; this combines naturally with + * other terms such as "smfn" (the mfn of a shadow page), gl2mfn (the mfn of a + * guest L2 page), etc... + */ + +/* With this defined, we do some ugly things to force the compiler to + * give us type safety between mfns and gfns and other integers. + * TYPE_SAFE(int foo) defines a foo_t, and _foo() and foo_x() functions + * that translate beween int and foo_t. + * + * It does have some performance cost because the types now have + * a different storage attribute, so may not want it on all the time. */ +#ifndef NDEBUG +#define TYPE_SAFETY 1 +#endif + +#ifdef TYPE_SAFETY +#define TYPE_SAFE(_type,_name) \ +typedef struct { _type _name; } _name##_t; \ +static inline _name##_t _##_name(_type n) { return (_name##_t) { n }; } \ +static inline _type _name##_x(_name##_t n) { return n._name; } +#else +#define TYPE_SAFE(_type,_name) \ +typedef _type _name##_t; \ +static inline _name##_t _##_name(_type n) { return n; } \ +static inline _type _name##_x(_name##_t n) { return n; } +#endif + +TYPE_SAFE(unsigned long,mfn) +#define SH_PRI_mfn "05lx" + +static inline int +valid_mfn(mfn_t m) +{ + return VALID_MFN(mfn_x(m)); +} + +static inline mfn_t +pagetable_get_mfn(pagetable_t pt) +{ + return _mfn(pagetable_get_pfn(pt)); +} + +static inline pagetable_t +pagetable_from_mfn(mfn_t mfn) +{ + return pagetable_from_pfn(mfn_x(mfn)); +} + +static inline int +shadow_vcpu_mode_translate(struct vcpu *v) +{ + // Returns true if this VCPU needs to be using the P2M table to translate + // between GFNs and MFNs. + // + // This is true of translated HVM domains on a vcpu which has paging + // enabled. (HVM vcpu's with paging disabled are using the p2m table as + // its paging table, so no translation occurs in this case.) 
+ // + return v->arch.shadow.hvm_paging_enabled; +} + + +/**************************************************************************/ +/* Mode-specific entry points into the shadow code */ + +struct x86_emulate_ctxt; +struct shadow_paging_mode { + int (*page_fault )(struct vcpu *v, unsigned long va, + struct cpu_user_regs *regs); + int (*invlpg )(struct vcpu *v, unsigned long va); + unsigned long (*gva_to_gpa )(struct vcpu *v, unsigned long va); + unsigned long (*gva_to_gfn )(struct vcpu *v, unsigned long va); + void (*update_cr3 )(struct vcpu *v); + int (*map_and_validate_gl1e )(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + int (*map_and_validate_gl2e )(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + int (*map_and_validate_gl2he)(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + int (*map_and_validate_gl3e )(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + int (*map_and_validate_gl4e )(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + void (*detach_old_tables )(struct vcpu *v); + int (*x86_emulate_write )(struct vcpu *v, unsigned long va, + void *src, u32 bytes, + struct x86_emulate_ctxt *ctxt); + int (*x86_emulate_cmpxchg )(struct vcpu *v, unsigned long va, + unsigned long old, + unsigned long new, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt); + int (*x86_emulate_cmpxchg8b )(struct vcpu *v, unsigned long va, + unsigned long old_lo, + unsigned long old_hi, + unsigned long new_lo, + unsigned long new_hi, + struct x86_emulate_ctxt *ctxt); + mfn_t (*make_monitor_table )(struct vcpu *v); + void (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn); +#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC + int (*guess_wrmap )(struct vcpu *v, + unsigned long vaddr, mfn_t gmfn); +#endif + /* For outsiders to tell what mode we're in */ + unsigned int shadow_levels; + unsigned int guest_levels; +}; + +static inline int shadow_guest_paging_levels(struct vcpu *v) +{ + 
ASSERT(v->arch.shadow.mode != NULL); + return v->arch.shadow.mode->guest_levels; +} + +/**************************************************************************/ +/* Entry points into the shadow code */ + +/* Turning on shadow test mode */ +int shadow_test_enable(struct domain *d); + +/* Handler for shadow control ops: enabling and disabling shadow modes, + * and log-dirty bitmap ops all happen through here. */ +int shadow_domctl(struct domain *d, + xen_domctl_shadow_op_t *sc, + XEN_GUEST_HANDLE(xen_domctl_t) u_domctl); + +/* Call when destroying a domain */ +void shadow_teardown(struct domain *d); + +/* Call once all of the references to the domain have gone away */ +void shadow_final_teardown(struct domain *d); + + +/* Mark a page as dirty in the bitmap */ +void sh_do_mark_dirty(struct domain *d, mfn_t gmfn); +static inline void mark_dirty(struct domain *d, unsigned long gmfn) +{ + if ( shadow_mode_log_dirty(d) ) + { + shadow_lock(d); + sh_do_mark_dirty(d, _mfn(gmfn)); + shadow_unlock(d); + } +} + +/* Internal version, for when the shadow lock is already held */ +static inline void sh_mark_dirty(struct domain *d, mfn_t gmfn) +{ + ASSERT(shadow_lock_is_acquired(d)); + if ( shadow_mode_log_dirty(d) ) + sh_do_mark_dirty(d, gmfn); +} + +static inline int +shadow_fault(unsigned long va, struct cpu_user_regs *regs) +/* Called from pagefault handler in Xen, and from the HVM trap handlers + * for pagefaults. Returns 1 if this fault was an artefact of the + * shadow code (and the guest should retry) or 0 if it is not (and the + * fault should be handled elsewhere or passed to the guest). */ +{ + struct vcpu *v = current; + perfc_incrc(shadow_fault); + return v->arch.shadow.mode->page_fault(v, va, regs); +} + +static inline int +shadow_invlpg(struct vcpu *v, unsigned long va) +/* Called when the guest requests an invlpg. Returns 1 if the invlpg + * instruction should be issued on the hardware, or 0 if it's safe not + * to do so. 
*/ +{ + return v->arch.shadow.mode->invlpg(v, va); +} + +static inline unsigned long +shadow_gva_to_gpa(struct vcpu *v, unsigned long va) +/* Called to translate a guest virtual address to what the *guest* + * pagetables would map it to. */ +{ + return v->arch.shadow.mode->gva_to_gpa(v, va); +} + +static inline unsigned long +shadow_gva_to_gfn(struct vcpu *v, unsigned long va) +/* Called to translate a guest virtual address to what the *guest* + * pagetables would map it to. */ +{ + return v->arch.shadow.mode->gva_to_gfn(v, va); +} + +static inline void +shadow_update_cr3(struct vcpu *v) +/* Updates all the things that are derived from the guest's CR3. + * Called when the guest changes CR3. */ +{ + shadow_lock(v->domain); + v->arch.shadow.mode->update_cr3(v); + shadow_unlock(v->domain); +} + + +/* Should be called after CR3 is updated. + * Updates vcpu->arch.cr3 and, for HVM guests, vcpu->arch.hvm_vcpu.cpu_cr3. + * + * Also updates other state derived from CR3 (vcpu->arch.guest_vtable, + * shadow_vtable, etc). + * + * Uses values found in vcpu->arch.(guest_table and guest_table_user), and + * for HVM guests, arch.monitor_table and hvm's guest CR3. + * + * Update ref counts to shadow tables appropriately. + * For PAE, relocate L3 entries, if necessary, into low memory. + */ +static inline void update_cr3(struct vcpu *v) +{ + unsigned long cr3_mfn=0; + + if ( shadow_mode_enabled(v->domain) ) + { + shadow_update_cr3(v); + return; + } + +#if CONFIG_PAGING_LEVELS == 4 + if ( !(v->arch.flags & TF_kernel_mode) ) + cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user); + else +#endif + cr3_mfn = pagetable_get_pfn(v->arch.guest_table); + + make_cr3(v, cr3_mfn); +} + +extern void sh_update_paging_modes(struct vcpu *v); + +/* Should be called to initialise paging structures if the paging mode + * has changed, and when bringing up a VCPU for the first time. 
*/ +static inline void shadow_update_paging_modes(struct vcpu *v) +{ + ASSERT(shadow_mode_enabled(v->domain)); + shadow_lock(v->domain); + sh_update_paging_modes(v); + shadow_unlock(v->domain); +} + +static inline void +shadow_detach_old_tables(struct vcpu *v) +{ + if ( v->arch.shadow.mode ) + v->arch.shadow.mode->detach_old_tables(v); +} + +static inline mfn_t +shadow_make_monitor_table(struct vcpu *v) +{ + return v->arch.shadow.mode->make_monitor_table(v); +} + +static inline void +shadow_destroy_monitor_table(struct vcpu *v, mfn_t mmfn) +{ + v->arch.shadow.mode->destroy_monitor_table(v, mmfn); +} + +/* Validate a pagetable change from the guest and update the shadows. */ +extern int shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry); + +/* Update the shadows in response to a pagetable write from a HVM guest */ +extern void shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, + void *entry, u32 size); + +/* Remove all writeable mappings of a guest frame from the shadows. + * Returns non-zero if we need to flush TLBs. + * level and fault_addr desribe how we found this to be a pagetable; + * level==0 means we have some other reason for revoking write access. */ +extern int shadow_remove_write_access(struct vcpu *v, mfn_t readonly_mfn, + unsigned int level, + unsigned long fault_addr); + +/* Remove all mappings of the guest mfn from the shadows. + * Returns non-zero if we need to flush TLBs. */ +extern int shadow_remove_all_mappings(struct vcpu *v, mfn_t target_mfn); + +void +shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn); +/* This is a HVM page that we thing is no longer a pagetable. + * Unshadow it, and recursively unshadow pages that reference it. */ + +/* Remove all shadows of the guest mfn. 
*/ +extern void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int all); +static inline void shadow_remove_all_shadows(struct vcpu *v, mfn_t gmfn) +{ + sh_remove_shadows(v, gmfn, 1); +} + +/* Add a page to a domain */ +void +shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn, + unsigned long mfn); + +/* Remove a page from a domain */ +void +shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn, + unsigned long mfn); + +/* + * Definitions for the shadow_flags field in page_info. + * These flags are stored on *guest* pages... + * Bits 1-13 are encodings for the shadow types. + */ +#define PGC_SH_type_to_index(_type) ((_type) >> PGC_SH_type_shift) +#define SHF_page_type_mask \ + (((1u << (PGC_SH_type_to_index(PGC_SH_max_shadow) + 1u)) - 1u) - \ + ((1u << PGC_SH_type_to_index(PGC_SH_min_shadow)) - 1u)) + +#define SHF_L1_32 (1u << PGC_SH_type_to_index(PGC_SH_l1_32_shadow)) +#define SHF_FL1_32 (1u << PGC_SH_type_to_index(PGC_SH_fl1_32_shadow)) +#define SHF_L2_32 (1u << PGC_SH_type_to_index(PGC_SH_l2_32_shadow)) +#define SHF_L1_PAE (1u << PGC_SH_type_to_index(PGC_SH_l1_pae_shadow)) +#define SHF_FL1_PAE (1u << PGC_SH_type_to_index(PGC_SH_fl1_pae_shadow)) +#define SHF_L2_PAE (1u << PGC_SH_type_to_index(PGC_SH_l2_pae_shadow)) +#define SHF_L2H_PAE (1u << PGC_SH_type_to_index(PGC_SH_l2h_pae_shadow)) +#define SHF_L3_PAE (1u << PGC_SH_type_to_index(PGC_SH_l3_pae_shadow)) +#define SHF_L1_64 (1u << PGC_SH_type_to_index(PGC_SH_l1_64_shadow)) +#define SHF_FL1_64 (1u << PGC_SH_type_to_index(PGC_SH_fl1_64_shadow)) +#define SHF_L2_64 (1u << PGC_SH_type_to_index(PGC_SH_l2_64_shadow)) +#define SHF_L3_64 (1u << PGC_SH_type_to_index(PGC_SH_l3_64_shadow)) +#define SHF_L4_64 (1u << PGC_SH_type_to_index(PGC_SH_l4_64_shadow)) + +/* Used for hysteresis when automatically unhooking mappings on fork/exit */ +#define SHF_unhooked_mappings (1u<<31) + +/* + * Allocation of shadow pages + */ + +/* Return the minumum acceptable number of shadow pages a domain needs */ 
+unsigned int shadow_min_acceptable_pages(struct domain *d); + +/* Set the pool of shadow pages to the required number of MB. + * Input will be rounded up to at least min_acceptable_shadow_pages(). + * Returns 0 for success, 1 for failure. */ +unsigned int shadow_set_allocation(struct domain *d, + unsigned int megabytes, + int *preempted); + +/* Return the size of the shadow pool, rounded up to the nearest MB */ +static inline unsigned int shadow_get_allocation(struct domain *d) +{ + unsigned int pg = d->arch.shadow.total_pages; + return ((pg >> (20 - PAGE_SHIFT)) + + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0)); +} + +/* + * Linked list for chaining entries in the shadow hash table. + */ +struct shadow_hash_entry { + struct shadow_hash_entry *next; + mfn_t smfn; /* MFN of the shadow */ +#ifdef _x86_64_ /* Shorten 'n' so we don't waste a whole word on storing 't' */ + unsigned long n:56; /* MFN of guest PT or GFN of guest superpage */ +#else + unsigned long n; /* MFN of guest PT or GFN of guest superpage */ +#endif + unsigned char t; /* shadow type bits, or 0 for empty */ +}; + +#define SHADOW_HASH_BUCKETS 251 +/* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */ + + +#if SHADOW_OPTIMIZATIONS & SHOPT_CACHE_WALKS +/* Optimization: cache the results of guest walks. This helps with MMIO + * and emulated writes, which tend to issue very similar walk requests + * repeatedly. We keep the results of the last few walks, and blow + * away the cache on guest cr3 write, mode change, or page fault. 
*/ + +#define SH_WALK_CACHE_ENTRIES 4 + +/* Rather than cache a guest walk, which would include mapped pointers + * to pages, we cache what a TLB would remember about the walk: the + * permissions and the l1 gfn */ +struct shadow_walk_cache { + unsigned long va; /* The virtual address (or 0 == unused) */ + unsigned long gfn; /* The gfn from the effective l1e */ + u32 permissions; /* The aggregated permission bits */ +}; +#endif + + +/**************************************************************************/ +/* Guest physmap (p2m) support */ + +/* Walk another domain's P2M table, mapping pages as we go */ +extern mfn_t +sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn); + + +/* General conversion function from gfn to mfn */ +static inline mfn_t +sh_gfn_to_mfn(struct domain *d, unsigned long gfn) +{ + if ( !shadow_mode_translate(d) ) + return _mfn(gfn); + else if ( likely(current->domain == d) ) + return _mfn(get_mfn_from_gpfn(gfn)); + else + return sh_gfn_to_mfn_foreign(d, gfn); +} + +// vcpu-specific version of gfn_to_mfn(). This is where we hide the dirty +// little secret that, for hvm guests with paging disabled, nearly all of the +// shadow code actually think that the guest is running on *untranslated* page +// tables (which is actually domain->phys_table). 
+// +static inline mfn_t +sh_vcpu_gfn_to_mfn(struct vcpu *v, unsigned long gfn) +{ + if ( !shadow_vcpu_mode_translate(v) ) + return _mfn(gfn); + if ( likely(current->domain == v->domain) ) + return _mfn(get_mfn_from_gpfn(gfn)); + return sh_gfn_to_mfn_foreign(v->domain, gfn); +} + +static inline unsigned long +sh_mfn_to_gfn(struct domain *d, mfn_t mfn) +{ + if ( shadow_mode_translate(d) ) + return get_gpfn_from_mfn(mfn_x(mfn)); + else + return mfn_x(mfn); +} + + #endif /* _XEN_SHADOW_H */ @@ -49,7 +633,7 @@ * mode: C * c-set-style: "BSD" * c-basic-offset: 4 - * tab-width: 4 * indent-tabs-mode: nil * End: */ + diff --git a/xen/include/asm-x86/shadow2-multi.h b/xen/include/asm-x86/shadow2-multi.h deleted file mode 100644 index ba3f5287b6..0000000000 --- a/xen/include/asm-x86/shadow2-multi.h +++ /dev/null @@ -1,116 +0,0 @@ -/****************************************************************************** - * arch/x86/shadow2-multi.h - * - * Shadow2 declarations which will be multiply compiled. - * Parts of this code are Copyright (c) 2006 by XenSource Inc. - * Parts of this code are Copyright (c) 2006 by Michael A Fetterman - * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -extern int -SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, SHADOW_LEVELS, GUEST_LEVELS)( - struct vcpu *v, mfn_t gl1mfn, void *new_gl1p, u32 size); -extern int -SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, SHADOW_LEVELS, GUEST_LEVELS)( - struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size); -extern int -SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, SHADOW_LEVELS, GUEST_LEVELS)( - struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size); -extern int -SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, SHADOW_LEVELS, GUEST_LEVELS)( - struct vcpu *v, mfn_t gl3mfn, void *new_gl3p, u32 size); -extern int -SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, SHADOW_LEVELS, GUEST_LEVELS)( - struct vcpu *v, mfn_t gl4mfn, void *new_gl4p, u32 size); - -extern void -SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS)( - struct vcpu *v, mfn_t smfn); -extern void -SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS)( - struct vcpu *v, mfn_t smfn); -extern void -SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS)( - struct vcpu *v, mfn_t smfn); -extern void -SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, SHADOW_LEVELS, GUEST_LEVELS)( - struct vcpu *v, mfn_t smfn); - -extern void -SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows, 3, 3) - (struct vcpu *v, mfn_t smfn); - -extern void -SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, mfn_t sl2mfn); -extern void -SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, mfn_t sl3mfn); -extern void -SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, mfn_t sl4mfn); - -extern int -SHADOW2_INTERNAL_NAME(sh2_remove_write_access, 
SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn); -extern int -SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn); - -extern void -SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, void *ep, mfn_t smfn); - -extern int -SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn); -extern int -SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn); -extern int -SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn); - -#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES -int -SHADOW2_INTERNAL_NAME(sh2_audit_l1_table, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, mfn_t sl1mfn, mfn_t x); -int -SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, mfn_t sl1mfn, mfn_t x); -int -SHADOW2_INTERNAL_NAME(sh2_audit_l2_table, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, mfn_t sl2mfn, mfn_t x); -int -SHADOW2_INTERNAL_NAME(sh2_audit_l3_table, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, mfn_t sl3mfn, mfn_t x); -int -SHADOW2_INTERNAL_NAME(sh2_audit_l4_table, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, mfn_t sl4mfn, mfn_t x); -#endif - -#if SHADOW_LEVELS == GUEST_LEVELS -extern mfn_t -SHADOW2_INTERNAL_NAME(sh2_make_monitor_table, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v); -extern void -SHADOW2_INTERNAL_NAME(sh2_destroy_monitor_table, SHADOW_LEVELS, GUEST_LEVELS) - (struct vcpu *v, mfn_t mmfn); -#endif - -extern struct shadow2_paging_mode -SHADOW2_INTERNAL_NAME(sh2_paging_mode, SHADOW_LEVELS, GUEST_LEVELS); diff --git a/xen/include/asm-x86/shadow2-private.h b/xen/include/asm-x86/shadow2-private.h deleted file mode 100644 index 8637692bba..0000000000 --- a/xen/include/asm-x86/shadow2-private.h +++ 
/dev/null @@ -1,593 +0,0 @@ -/****************************************************************************** - * arch/x86/shadow2-private.h - * - * Shadow2 code that is private, and does not need to be multiply compiled. - * Parts of this code are Copyright (c) 2006 by XenSource Inc. - * Parts of this code are Copyright (c) 2006 by Michael A Fetterman - * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef _XEN_SHADOW2_PRIVATE_H -#define _XEN_SHADOW2_PRIVATE_H - -// In order to override the definition of mfn_to_page, we make sure page.h has -// been included... -#include -#include -#include -#include - - -/****************************************************************************** - * Definitions for the use of the "available" bits in the shadow PTEs. 
- * - * Review of the low 12 bits of a shadow page table entry: - * - * in a guest: in a shadow: - * Bit 11: _PAGE_AVAIL2, aka _PAGE_GNTTAB - * Bit 10: _PAGE_AVAIL1 _PAGE_SHADOW_RW ("SW" below) - * Bit 9: _PAGE_AVAIL0 _PAGE_SHADOW_PRESENT ("SP" below) - * Bit 8: _PAGE_GLOBAL _PAGE_SHADOW_MMIO ("MMIO" below), - * aka _PAGE_SHADOW_GUEST_NOT_PRESENT - * Bit 7: _PAGE_PSE, aka _PAGE_PAT - * Bit 6: _PAGE_DIRTY - * Bit 5: _PAGE_ACCESSED - * Bit 4: _PAGE_PCD - * Bit 3: _PAGE_PWT - * Bit 2: _PAGE_USER - * Bit 1: _PAGE_RW ("GW" below) - * Bit 0: _PAGE_PRESENT ("GP" below) - * - * Given a guest entry, as shown below, we can expect the following in the - * corresponding shadow entry: - * - * Guest entry Shadow entry Commentary - * ----------- ---------------- --------------------------------------------- - * Maps - * GP GW IO GP SP GW SW MMIO - * -- -- ---- -- -- -- -- ---- - * - - - 0 0 0 0 0 The guest entry has not yet been shadowed. - * 0 - - 0 0 0 0 1 The guest entry is marked not-present. - * 1 1 no ? 1 ? 1 0 Writable entry in the guest. - * 1 0 no ? 1 0 0 0 Read-only entry in the guest. - * 1 1 yes 0 1 ? 1 1 Writable MMIO mapping in the guest. - * 1 0 yes 0 1 0 0 1 Read-only MMIO mapping in the guest. - * - * Normally, we would expect that GP=1 in the guest to imply GP=1 in the - * shadow, and similarly for GW=1. However, various functionality that may be - * implemented via the shadow can cause GP or GW to be cleared in such cases. - * A & D bit emulation is a prime example of such functionality. - * - * If _PAGE_SHADOW_PRESENT is zero, then the _PAGE_PRESENT bit in that same - * entry will always be zero, too. - - * Bit 11 is used in debug builds as the _PAGE_GNTTAB bit in PV guests. It is - * currently available for random (ab)use in shadow entries. - * - * Bit 8 (the global bit) could be propagated from an HVM guest to the shadow, - * but currently there is no benefit, as the guest's TLB is flushed on every - * transition of CR3 anyway due to the HVM exit/re-entry. 
- * - * In shadow entries in which the _PAGE_SHADOW_PRESENT is set, bit 8 is used - * as the _PAGE_SHADOW_MMIO bit. In such entries, if _PAGE_SHADOW_MMIO is - * set, then the entry contains the *gfn* directly from the corresponding - * guest entry (not an mfn!!). - * - * Bit 7 is set in a guest L2 to signify a superpage entry. The current - * shadow code splinters superpage mappings into 512 or 1024 4K mappings; the - * resulting shadow L1 table is called an FL1. Note that there is no guest - * page that corresponds to an FL1. - * - * Bit 7 in a guest L1 is the PAT2 bit. Currently we do not support PAT in - * this shadow code. - * - * Bit 6 is the dirty bit. - * - * Bit 5 is the accessed bit. - * - * Bit 4 is the cache disable bit. If set in a guest, the hardware is - * supposed to refuse to cache anything found via this entry. It can be set - * in an L4e, L3e, L2e, or L1e. This shadow code currently does not support - * cache disable bits. They are silently ignored. - * - * Bit 4 is a guest L1 is also the PAT1 bit. Currently we do not support PAT - * in this shadow code. - * - * Bit 3 is the cache write-thru bit. If set in a guest, the hardware is - * supposed to use write-thru instead of write-back caching for anything found - * via this entry. It can be set in an L4e, L3e, L2e, or L1e. This shadow - * code currently does not support cache write-thru bits. They are silently - * ignored. - * - * Bit 3 is a guest L1 is also the PAT0 bit. Currently we do not support PAT - * in this shadow code. - * - * Bit 2 is the user bit. - * - * Bit 1 is the read-write bit. - * - * Bit 0 is the present bit. - */ - -// Copy of the _PAGE_RW bit from the guest's PTE, appropriately zero'ed by -// the appropriate shadow rules. 
-#define _PAGE_SHADOW_RW _PAGE_AVAIL1 - -// Copy of the _PAGE_PRESENT bit from the guest's PTE -#define _PAGE_SHADOW_PRESENT _PAGE_AVAIL0 - -// The matching guest entry maps MMIO space -#define _PAGE_SHADOW_MMIO _PAGE_GLOBAL - -// Shadow flags value used when the guest is not present -#define _PAGE_SHADOW_GUEST_NOT_PRESENT _PAGE_GLOBAL - - -/****************************************************************************** - * Debug and error-message output - */ -#define SHADOW2_PRINTK(_f, _a...) \ - debugtrace_printk("sh2: %s(): " _f, __func__, ##_a) -#define SHADOW2_ERROR(_f, _a...) \ - printk("sh2 error: %s(): " _f, __func__, ##_a) -#define SHADOW2_DEBUG(flag, _f, _a...) \ - do { \ - if (SHADOW2_DEBUG_ ## flag) \ - debugtrace_printk("sh2debug: %s(): " _f, __func__, ##_a); \ - } while (0) - -// The flags for use with SHADOW2_DEBUG: -#define SHADOW2_DEBUG_PROPAGATE 0 -#define SHADOW2_DEBUG_MAKE_SHADOW 0 -#define SHADOW2_DEBUG_DESTROY_SHADOW 0 -#define SHADOW2_DEBUG_P2M 0 -#define SHADOW2_DEBUG_A_AND_D 0 -#define SHADOW2_DEBUG_EMULATE 0 -#define SHADOW2_DEBUG_LOGDIRTY 1 - - -/****************************************************************************** - * Auditing routines - */ - -#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL -extern void shadow2_audit_tables(struct vcpu *v); -#else -#define shadow2_audit_tables(_v) do {} while(0) -#endif - -#if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M -extern void shadow2_audit_p2m(struct domain *d); -#else -#define shadow2_audit_p2m(_d) do {} while(0) -#endif - - -/****************************************************************************** - * Mechanism for double-checking the optimized pagefault path: this - * structure contains a record of actions taken by the fault handling - * code. In paranoid mode, the fast-path code fills out one of these - * structures (but doesn't take any actual action) and then the normal - * path fills in another. 
When the fault handler finishes, the - * two are compared */ - -#ifdef SHADOW2_OPTIMIZATION_PARANOIA - -typedef struct shadow2_action_log sh2_log_t; -struct shadow2_action_log { - paddr_t ad[CONFIG_PAGING_LEVELS]; /* A & D bits propagated here */ - paddr_t mmio; /* Address of an mmio operation */ - int rv; /* Result of the fault handler */ -}; - -/* There are two logs, one for the fast path, one for the normal path */ -enum sh2_log_type { log_slow = 0, log_fast= 1 }; - -/* Alloc and zero the logs */ -static inline void sh2_init_log(struct vcpu *v) -{ - if ( unlikely(!v->arch.shadow2.action_log) ) - v->arch.shadow2.action_log = xmalloc_array(sh2_log_t, 2); - ASSERT(v->arch.shadow2.action_log); - memset(v->arch.shadow2.action_log, 0, 2 * sizeof (sh2_log_t)); -} - -/* Log an A&D-bit update */ -static inline void sh2_log_ad(struct vcpu *v, paddr_t e, unsigned int level) -{ - v->arch.shadow2.action_log[v->arch.shadow2.action_index].ad[level] = e; -} - -/* Log an MMIO address */ -static inline void sh2_log_mmio(struct vcpu *v, paddr_t m) -{ - v->arch.shadow2.action_log[v->arch.shadow2.action_index].mmio = m; -} - -/* Log the result */ -static inline void sh2_log_rv(struct vcpu *v, int rv) -{ - v->arch.shadow2.action_log[v->arch.shadow2.action_index].rv = rv; -} - -/* Set which mode we're in */ -static inline void sh2_set_log_mode(struct vcpu *v, enum sh2_log_type t) -{ - v->arch.shadow2.action_index = t; -} - -/* Know not to take action, because we're only checking the mechanism */ -static inline int sh2_take_no_action(struct vcpu *v) -{ - return (v->arch.shadow2.action_index == log_fast); -} - -#else /* Non-paranoid mode: these logs do not exist */ - -#define sh2_init_log(_v) do { (void)(_v); } while(0) -#define sh2_set_log_mode(_v,_t) do { (void)(_v); } while(0) -#define sh2_log_ad(_v,_e,_l) do { (void)(_v),(void)(_e),(void)(_l); } while (0) -#define sh2_log_mmio(_v,_m) do { (void)(_v),(void)(_m); } while (0) -#define sh2_log_rv(_v,_r) do { (void)(_v),(void)(_r); } 
while (0) -#define sh2_take_no_action(_v) (((void)(_v)), 0) - -#endif /* SHADOW2_OPTIMIZATION_PARANOIA */ - - -/****************************************************************************** - * Macro for dealing with the naming of the internal names of the - * shadow code's external entry points. - */ -#define SHADOW2_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels) \ - name ## __shadow_ ## shadow_levels ## _guest_ ## guest_levels -#define SHADOW2_INTERNAL_NAME(name, shadow_levels, guest_levels) \ - SHADOW2_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels) - -#if CONFIG_PAGING_LEVELS == 2 -#define GUEST_LEVELS 2 -#define SHADOW_LEVELS 2 -#include -#undef GUEST_LEVELS -#undef SHADOW_LEVELS -#endif /* CONFIG_PAGING_LEVELS == 2 */ - -#if CONFIG_PAGING_LEVELS == 3 -#define GUEST_LEVELS 2 -#define SHADOW_LEVELS 3 -#include -#undef GUEST_LEVELS -#undef SHADOW_LEVELS - -#define GUEST_LEVELS 3 -#define SHADOW_LEVELS 3 -#include -#undef GUEST_LEVELS -#undef SHADOW_LEVELS -#endif /* CONFIG_PAGING_LEVELS == 3 */ - -#if CONFIG_PAGING_LEVELS == 4 -#define GUEST_LEVELS 2 -#define SHADOW_LEVELS 3 -#include -#undef GUEST_LEVELS -#undef SHADOW_LEVELS - -#define GUEST_LEVELS 3 -#define SHADOW_LEVELS 3 -#include -#undef GUEST_LEVELS -#undef SHADOW_LEVELS - -#define GUEST_LEVELS 3 -#define SHADOW_LEVELS 4 -#include -#undef GUEST_LEVELS -#undef SHADOW_LEVELS - -#define GUEST_LEVELS 4 -#define SHADOW_LEVELS 4 -#include -#undef GUEST_LEVELS -#undef SHADOW_LEVELS -#endif /* CONFIG_PAGING_LEVELS == 4 */ - - -/****************************************************************************** - * Various function declarations - */ - -/* x86 emulator support */ -extern struct x86_emulate_ops shadow2_emulator_ops; - -/* Hash table functions */ -mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t); -void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn); -void shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn); - -/* shadow 
promotion */ -void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type); -void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type); - -/* Shadow page allocation functions */ -void shadow2_prealloc(struct domain *d, unsigned int order); -mfn_t shadow2_alloc(struct domain *d, - u32 shadow_type, - unsigned long backpointer); -void shadow2_free(struct domain *d, mfn_t smfn); - -/* Function to convert a shadow to log-dirty */ -void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn); - -/* Dispatcher function: call the per-mode function that will unhook the - * non-Xen mappings in this top-level shadow mfn */ -void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn); - -/* Re-sync copies of PAE shadow L3 tables if they have been changed */ -void sh2_pae_recopy(struct domain *d); - -/* Install the xen mappings in various flavours of shadow */ -void sh2_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn); -void sh2_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn); -void sh2_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn); -void sh2_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn); - - -/****************************************************************************** - * MFN/page-info handling - */ - -// Override mfn_to_page from asm/page.h, which was #include'd above, -// in order to make it work with our mfn type. -#undef mfn_to_page -#define mfn_to_page(_mfn) (frame_table + mfn_x(_mfn)) - -// Override page_to_mfn from asm/page.h, which was #include'd above, -// in order to make it work with our mfn type. -#undef page_to_mfn -#define page_to_mfn(_pg) (_mfn((_pg) - frame_table)) - -// Override mfn_valid from asm/page.h, which was #include'd above, -// in order to make it work with our mfn type. 
-#undef mfn_valid -#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page) - -// Provide mfn_t-aware versions of common xen functions -static inline void * -sh2_map_domain_page(mfn_t mfn) -{ - /* XXX Using the monitor-table as a map will happen here */ - return map_domain_page(mfn_x(mfn)); -} - -static inline void -sh2_unmap_domain_page(void *p) -{ - /* XXX Using the monitor-table as a map will happen here */ - unmap_domain_page(p); -} - -static inline void * -sh2_map_domain_page_global(mfn_t mfn) -{ - /* XXX Using the monitor-table as a map will happen here */ - return map_domain_page_global(mfn_x(mfn)); -} - -static inline void -sh2_unmap_domain_page_global(void *p) -{ - /* XXX Using the monitor-table as a map will happen here */ - unmap_domain_page_global(p); -} - -static inline int -sh2_mfn_is_dirty(struct domain *d, mfn_t gmfn) -/* Is this guest page dirty? Call only in log-dirty mode. */ -{ - unsigned long pfn; - ASSERT(shadow2_mode_log_dirty(d)); - ASSERT(d->arch.shadow2.dirty_bitmap != NULL); - - /* We /really/ mean PFN here, even for non-translated guests. */ - pfn = get_gpfn_from_mfn(mfn_x(gmfn)); - if ( likely(VALID_M2P(pfn)) - && likely(pfn < d->arch.shadow2.dirty_bitmap_size) - && test_bit(pfn, d->arch.shadow2.dirty_bitmap) ) - return 1; - - return 0; -} - -static inline int -sh2_mfn_is_a_page_table(mfn_t gmfn) -{ - struct page_info *page = mfn_to_page(gmfn); - struct domain *owner; - unsigned long type_info; - - if ( !valid_mfn(gmfn) ) - return 0; - - owner = page_get_owner(page); - if ( owner && shadow2_mode_refcounts(owner) - && (page->count_info & PGC_page_table) ) - return 1; - - type_info = page->u.inuse.type_info & PGT_type_mask; - return type_info && (type_info <= PGT_l4_page_table); -} - - -/**************************************************************************/ -/* Shadow-page refcounting. 
See comment in shadow2-common.c about the - * use of struct page_info fields for shadow pages */ - -void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn); - -/* Increase the refcount of a shadow page. Arguments are the mfn to refcount, - * and the physical address of the shadow entry that holds the ref (or zero - * if the ref is held by something else) */ -static inline void sh2_get_ref(mfn_t smfn, paddr_t entry_pa) -{ - u32 x, nx; - struct page_info *page = mfn_to_page(smfn); - - ASSERT(mfn_valid(smfn)); - - x = page->count_info & PGC_SH2_count_mask; - nx = x + 1; - - if ( unlikely(nx & ~PGC_SH2_count_mask) ) - { - SHADOW2_PRINTK("shadow ref overflow, gmfn=%" PRtype_info " smfn=%lx\n", - page->u.inuse.type_info, mfn_x(smfn)); - domain_crash_synchronous(); - } - - /* Guarded by the shadow lock, so no need for atomic update */ - page->count_info &= ~PGC_SH2_count_mask; - page->count_info |= nx; - - /* We remember the first shadow entry that points to each shadow. */ - if ( entry_pa != 0 && page->up == 0 ) - page->up = entry_pa; -} - - -/* Decrease the refcount of a shadow page. As for get_ref, takes the - * physical address of the shadow entry that held this reference. 
*/ -static inline void sh2_put_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa) -{ - u32 x, nx; - struct page_info *page = mfn_to_page(smfn); - - ASSERT(mfn_valid(smfn)); - ASSERT(page_get_owner(page) == NULL); - - /* If this is the entry in the up-pointer, remove it */ - if ( entry_pa != 0 && page->up == entry_pa ) - page->up = 0; - - x = page->count_info & PGC_SH2_count_mask; - nx = x - 1; - - if ( unlikely(x == 0) ) - { - SHADOW2_PRINTK("shadow ref underflow, smfn=%lx oc=%08x t=%" - PRtype_info "\n", - mfn_x(smfn), - page->count_info & PGC_SH2_count_mask, - page->u.inuse.type_info); - domain_crash_synchronous(); - } - - /* Guarded by the shadow lock, so no need for atomic update */ - page->count_info &= ~PGC_SH2_count_mask; - page->count_info |= nx; - - if ( unlikely(nx == 0) ) - sh2_destroy_shadow(v, smfn); -} - - -/* Pin a shadow page: take an extra refcount and set the pin bit. */ -static inline void sh2_pin(mfn_t smfn) -{ - struct page_info *page; - - ASSERT(mfn_valid(smfn)); - page = mfn_to_page(smfn); - if ( !(page->count_info & PGC_SH2_pinned) ) - { - sh2_get_ref(smfn, 0); - page->count_info |= PGC_SH2_pinned; - } -} - -/* Unpin a shadow page: unset the pin bit and release the extra ref. */ -static inline void sh2_unpin(struct vcpu *v, mfn_t smfn) -{ - struct page_info *page; - - ASSERT(mfn_valid(smfn)); - page = mfn_to_page(smfn); - if ( page->count_info & PGC_SH2_pinned ) - { - page->count_info &= ~PGC_SH2_pinned; - sh2_put_ref(v, smfn, 0); - } -} - -/**************************************************************************/ -/* Guest physmap (p2m) support */ - -/* Read our own P2M table, checking in the linear pagetables first to be - * sure that we will succeed. Call this function if you expect it to - * fail often, as it avoids page faults. 
If you expect to succeed, use - * vcpu_gfn_to_mfn, which copy_from_user()s the entry */ -static inline mfn_t -vcpu_gfn_to_mfn_nofault(struct vcpu *v, unsigned long gfn) -{ - unsigned long entry_addr = (unsigned long) &phys_to_machine_mapping[gfn]; -#if CONFIG_PAGING_LEVELS >= 4 - l4_pgentry_t *l4e; - l3_pgentry_t *l3e; -#endif - l2_pgentry_t *l2e; - l1_pgentry_t *l1e; - - ASSERT(current == v); - if ( !shadow2_vcpu_mode_translate(v) ) - return _mfn(gfn); - -#if CONFIG_PAGING_LEVELS > 2 - if ( gfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) - /* This pfn is higher than the p2m map can hold */ - return _mfn(INVALID_MFN); -#endif - - /* Walk the linear pagetables. Note that this is *not* the same as - * the walk in sh2_gfn_to_mfn_foreign, which is walking the p2m map */ -#if CONFIG_PAGING_LEVELS >= 4 - l4e = __linear_l4_table + l4_linear_offset(entry_addr); - if ( !(l4e_get_flags(*l4e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); - l3e = __linear_l3_table + l3_linear_offset(entry_addr); - if ( !(l3e_get_flags(*l3e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); -#endif - l2e = __linear_l2_table + l2_linear_offset(entry_addr); - if ( !(l2e_get_flags(*l2e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); - l1e = __linear_l1_table + l1_linear_offset(entry_addr); - if ( !(l1e_get_flags(*l1e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); - - /* Safe to look at this part of the table */ - if ( l1e_get_flags(phys_to_machine_mapping[gfn]) & _PAGE_PRESENT ) - return _mfn(l1e_get_pfn(phys_to_machine_mapping[gfn])); - - return _mfn(INVALID_MFN); -} - - -#endif /* _XEN_SHADOW2_PRIVATE_H */ - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/include/asm-x86/shadow2-types.h b/xen/include/asm-x86/shadow2-types.h deleted file mode 100644 index 13107f4566..0000000000 --- a/xen/include/asm-x86/shadow2-types.h +++ /dev/null @@ -1,692 +0,0 @@ 
-/****************************************************************************** - * include/asm-x86/shadow2-types.h - * - * Parts of this code are Copyright (c) 2006 by XenSource Inc. - * Parts of this code are Copyright (c) 2006 by Michael A Fetterman - * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef _XEN_SHADOW2_TYPES_H -#define _XEN_SHADOW2_TYPES_H - -// Map a shadow page -static inline void * -map_shadow_page(mfn_t smfn) -{ - // XXX -- Possible optimization/measurement question for 32-bit and PAE - // hypervisors: - // How often is this smfn already available in the shadow linear - // table? Might it be worth checking that table first, - // presumably using the reverse map hint in the page_info of this - // smfn, rather than calling map_domain_page()? 
- // - return sh2_map_domain_page(smfn); -} - -// matching unmap for map_shadow_page() -static inline void -unmap_shadow_page(void *p) -{ - sh2_unmap_domain_page(p); -} - -/* - * Define various types for handling pagetabels, based on these options: - * SHADOW_PAGING_LEVELS : Number of levels of shadow pagetables - * GUEST_PAGING_LEVELS : Number of levels of guest pagetables - */ - -#if (CONFIG_PAGING_LEVELS < SHADOW_PAGING_LEVELS) -#error Cannot have more levels of shadow pagetables than host pagetables -#endif - -#if (SHADOW_PAGING_LEVELS < GUEST_PAGING_LEVELS) -#error Cannot have more levels of guest pagetables than shadow pagetables -#endif - -#if SHADOW_PAGING_LEVELS == 2 -#define SHADOW_L1_PAGETABLE_ENTRIES 1024 -#define SHADOW_L2_PAGETABLE_ENTRIES 1024 -#define SHADOW_L1_PAGETABLE_SHIFT 12 -#define SHADOW_L2_PAGETABLE_SHIFT 22 -#endif - -#if SHADOW_PAGING_LEVELS == 3 -#define SHADOW_L1_PAGETABLE_ENTRIES 512 -#define SHADOW_L2_PAGETABLE_ENTRIES 512 -#define SHADOW_L3_PAGETABLE_ENTRIES 4 -#define SHADOW_L1_PAGETABLE_SHIFT 12 -#define SHADOW_L2_PAGETABLE_SHIFT 21 -#define SHADOW_L3_PAGETABLE_SHIFT 30 -#endif - -#if SHADOW_PAGING_LEVELS == 4 -#define SHADOW_L1_PAGETABLE_ENTRIES 512 -#define SHADOW_L2_PAGETABLE_ENTRIES 512 -#define SHADOW_L3_PAGETABLE_ENTRIES 512 -#define SHADOW_L4_PAGETABLE_ENTRIES 512 -#define SHADOW_L1_PAGETABLE_SHIFT 12 -#define SHADOW_L2_PAGETABLE_SHIFT 21 -#define SHADOW_L3_PAGETABLE_SHIFT 30 -#define SHADOW_L4_PAGETABLE_SHIFT 39 -#endif - -/* Types of the shadow page tables */ -typedef l1_pgentry_t shadow_l1e_t; -typedef l2_pgentry_t shadow_l2e_t; -#if SHADOW_PAGING_LEVELS >= 3 -typedef l3_pgentry_t shadow_l3e_t; -#if SHADOW_PAGING_LEVELS >= 4 -typedef l4_pgentry_t shadow_l4e_t; -#endif -#endif - -/* Access functions for them */ -static inline paddr_t shadow_l1e_get_paddr(shadow_l1e_t sl1e) -{ return l1e_get_paddr(sl1e); } -static inline paddr_t shadow_l2e_get_paddr(shadow_l2e_t sl2e) -{ return l2e_get_paddr(sl2e); } -#if 
SHADOW_PAGING_LEVELS >= 3 -static inline paddr_t shadow_l3e_get_paddr(shadow_l3e_t sl3e) -{ return l3e_get_paddr(sl3e); } -#if SHADOW_PAGING_LEVELS >= 4 -static inline paddr_t shadow_l4e_get_paddr(shadow_l4e_t sl4e) -{ return l4e_get_paddr(sl4e); } -#endif -#endif - -static inline mfn_t shadow_l1e_get_mfn(shadow_l1e_t sl1e) -{ return _mfn(l1e_get_pfn(sl1e)); } -static inline mfn_t shadow_l2e_get_mfn(shadow_l2e_t sl2e) -{ return _mfn(l2e_get_pfn(sl2e)); } -#if SHADOW_PAGING_LEVELS >= 3 -static inline mfn_t shadow_l3e_get_mfn(shadow_l3e_t sl3e) -{ return _mfn(l3e_get_pfn(sl3e)); } -#if SHADOW_PAGING_LEVELS >= 4 -static inline mfn_t shadow_l4e_get_mfn(shadow_l4e_t sl4e) -{ return _mfn(l4e_get_pfn(sl4e)); } -#endif -#endif - -static inline u32 shadow_l1e_get_flags(shadow_l1e_t sl1e) -{ return l1e_get_flags(sl1e); } -static inline u32 shadow_l2e_get_flags(shadow_l2e_t sl2e) -{ return l2e_get_flags(sl2e); } -#if SHADOW_PAGING_LEVELS >= 3 -static inline u32 shadow_l3e_get_flags(shadow_l3e_t sl3e) -{ return l3e_get_flags(sl3e); } -#if SHADOW_PAGING_LEVELS >= 4 -static inline u32 shadow_l4e_get_flags(shadow_l4e_t sl4e) -{ return l4e_get_flags(sl4e); } -#endif -#endif - -static inline shadow_l1e_t -shadow_l1e_remove_flags(shadow_l1e_t sl1e, u32 flags) -{ l1e_remove_flags(sl1e, flags); return sl1e; } - -static inline shadow_l1e_t shadow_l1e_empty(void) -{ return l1e_empty(); } -static inline shadow_l2e_t shadow_l2e_empty(void) -{ return l2e_empty(); } -#if SHADOW_PAGING_LEVELS >= 3 -static inline shadow_l3e_t shadow_l3e_empty(void) -{ return l3e_empty(); } -#if SHADOW_PAGING_LEVELS >= 4 -static inline shadow_l4e_t shadow_l4e_empty(void) -{ return l4e_empty(); } -#endif -#endif - -static inline shadow_l1e_t shadow_l1e_from_mfn(mfn_t mfn, u32 flags) -{ return l1e_from_pfn(mfn_x(mfn), flags); } -static inline shadow_l2e_t shadow_l2e_from_mfn(mfn_t mfn, u32 flags) -{ return l2e_from_pfn(mfn_x(mfn), flags); } -#if SHADOW_PAGING_LEVELS >= 3 -static inline shadow_l3e_t 
shadow_l3e_from_mfn(mfn_t mfn, u32 flags) -{ return l3e_from_pfn(mfn_x(mfn), flags); } -#if SHADOW_PAGING_LEVELS >= 4 -static inline shadow_l4e_t shadow_l4e_from_mfn(mfn_t mfn, u32 flags) -{ return l4e_from_pfn(mfn_x(mfn), flags); } -#endif -#endif - -#define shadow_l1_table_offset(a) l1_table_offset(a) -#define shadow_l2_table_offset(a) l2_table_offset(a) -#define shadow_l3_table_offset(a) l3_table_offset(a) -#define shadow_l4_table_offset(a) l4_table_offset(a) - -/**************************************************************************/ -/* Access to the linear mapping of shadow page tables. */ - -/* Offsets into each level of the linear mapping for a virtual address. */ -#define shadow_l1_linear_offset(_a) \ - (((_a) & VADDR_MASK) >> SHADOW_L1_PAGETABLE_SHIFT) -#define shadow_l2_linear_offset(_a) \ - (((_a) & VADDR_MASK) >> SHADOW_L2_PAGETABLE_SHIFT) -#define shadow_l3_linear_offset(_a) \ - (((_a) & VADDR_MASK) >> SHADOW_L3_PAGETABLE_SHIFT) -#define shadow_l4_linear_offset(_a) \ - (((_a) & VADDR_MASK) >> SHADOW_L4_PAGETABLE_SHIFT) - -/* Where to find each level of the linear mapping. For PV guests, we use - * the shadow linear-map self-entry as many times as we need. For HVM - * guests, the shadow doesn't have a linear-map self-entry so we must use - * the monitor-table's linear-map entry N-1 times and then the shadow-map - * entry once. */ -#define __sh2_linear_l1_table ((shadow_l1e_t *)(SH_LINEAR_PT_VIRT_START)) -#define __sh2_linear_l2_table ((shadow_l2e_t *) \ - (__sh2_linear_l1_table + shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START))) - -// shadow linear L3 and L4 tables only exist in 4 level paging... 
-#if SHADOW_PAGING_LEVELS == 4 -#define __sh2_linear_l3_table ((shadow_l3e_t *) \ - (__sh2_linear_l2_table + shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START))) -#define __sh2_linear_l4_table ((shadow_l4e_t *) \ - (__sh2_linear_l3_table + shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START))) -#endif - -#define sh2_linear_l1_table(v) ({ \ - ASSERT(current == (v)); \ - __sh2_linear_l1_table; \ -}) - -#define sh2_linear_l2_table(v) ({ \ - ASSERT(current == (v)); \ - ((shadow_l2e_t *) \ - (hvm_guest(v) ? __linear_l1_table : __sh2_linear_l1_table) + \ - shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START)); \ -}) - -// shadow linear L3 and L4 tables only exist in 4 level paging... -#if SHADOW_PAGING_LEVELS == 4 -#define sh2_linear_l3_table(v) ({ \ - ASSERT(current == (v)); \ - ((shadow_l3e_t *) \ - (hvm_guest(v) ? __linear_l2_table : __sh2_linear_l2_table) + \ - shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START)); \ -}) - -// we use l4_pgentry_t instead of shadow_l4e_t below because shadow_l4e_t is -// not defined for when xen_levels==4 & shadow_levels==3... -#define sh2_linear_l4_table(v) ({ \ - ASSERT(current == (v)); \ - ((l4_pgentry_t *) \ - (hvm_guest(v) ? 
__linear_l3_table : __sh2_linear_l3_table) + \ - shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START)); \ -}) -#endif - -#if GUEST_PAGING_LEVELS == 2 - -#include - -#define GUEST_L1_PAGETABLE_ENTRIES 1024 -#define GUEST_L2_PAGETABLE_ENTRIES 1024 -#define GUEST_L1_PAGETABLE_SHIFT 12 -#define GUEST_L2_PAGETABLE_SHIFT 22 - -/* Type of the guest's frame numbers */ -TYPE_SAFE(u32,gfn) -#define INVALID_GFN ((u32)(-1u)) -#define SH2_PRI_gfn "05x" - -/* Types of the guest's page tables */ -typedef l1_pgentry_32_t guest_l1e_t; -typedef l2_pgentry_32_t guest_l2e_t; - -/* Access functions for them */ -static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e) -{ return l1e_get_paddr_32(gl1e); } -static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e) -{ return l2e_get_paddr_32(gl2e); } - -static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e) -{ return _gfn(l1e_get_paddr_32(gl1e) >> PAGE_SHIFT); } -static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e) -{ return _gfn(l2e_get_paddr_32(gl2e) >> PAGE_SHIFT); } - -static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e) -{ return l1e_get_flags_32(gl1e); } -static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e) -{ return l2e_get_flags_32(gl2e); } - -static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags) -{ l1e_add_flags_32(gl1e, flags); return gl1e; } -static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags) -{ l2e_add_flags_32(gl2e, flags); return gl2e; } - -static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags) -{ return l1e_from_pfn_32(gfn_x(gfn), flags); } -static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags) -{ return l2e_from_pfn_32(gfn_x(gfn), flags); } - -#define guest_l1_table_offset(a) l1_table_offset_32(a) -#define guest_l2_table_offset(a) l2_table_offset_32(a) - -/* The shadow types needed for the various levels. 
*/ -#define PGC_SH2_l1_shadow PGC_SH2_l1_32_shadow -#define PGC_SH2_l2_shadow PGC_SH2_l2_32_shadow -#define PGC_SH2_fl1_shadow PGC_SH2_fl1_32_shadow - -#else /* GUEST_PAGING_LEVELS != 2 */ - -#if GUEST_PAGING_LEVELS == 3 -#define GUEST_L1_PAGETABLE_ENTRIES 512 -#define GUEST_L2_PAGETABLE_ENTRIES 512 -#define GUEST_L3_PAGETABLE_ENTRIES 4 -#define GUEST_L1_PAGETABLE_SHIFT 12 -#define GUEST_L2_PAGETABLE_SHIFT 21 -#define GUEST_L3_PAGETABLE_SHIFT 30 -#else /* GUEST_PAGING_LEVELS == 4 */ -#define GUEST_L1_PAGETABLE_ENTRIES 512 -#define GUEST_L2_PAGETABLE_ENTRIES 512 -#define GUEST_L3_PAGETABLE_ENTRIES 512 -#define GUEST_L4_PAGETABLE_ENTRIES 512 -#define GUEST_L1_PAGETABLE_SHIFT 12 -#define GUEST_L2_PAGETABLE_SHIFT 21 -#define GUEST_L3_PAGETABLE_SHIFT 30 -#define GUEST_L4_PAGETABLE_SHIFT 39 -#endif - -/* Type of the guest's frame numbers */ -TYPE_SAFE(unsigned long,gfn) -#define INVALID_GFN ((unsigned long)(-1ul)) -#define SH2_PRI_gfn "05lx" - -/* Types of the guest's page tables */ -typedef l1_pgentry_t guest_l1e_t; -typedef l2_pgentry_t guest_l2e_t; -typedef l3_pgentry_t guest_l3e_t; -#if GUEST_PAGING_LEVELS >= 4 -typedef l4_pgentry_t guest_l4e_t; -#endif - -/* Access functions for them */ -static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e) -{ return l1e_get_paddr(gl1e); } -static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e) -{ return l2e_get_paddr(gl2e); } -static inline paddr_t guest_l3e_get_paddr(guest_l3e_t gl3e) -{ return l3e_get_paddr(gl3e); } -#if GUEST_PAGING_LEVELS >= 4 -static inline paddr_t guest_l4e_get_paddr(guest_l4e_t gl4e) -{ return l4e_get_paddr(gl4e); } -#endif - -static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e) -{ return _gfn(l1e_get_paddr(gl1e) >> PAGE_SHIFT); } -static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e) -{ return _gfn(l2e_get_paddr(gl2e) >> PAGE_SHIFT); } -static inline gfn_t guest_l3e_get_gfn(guest_l3e_t gl3e) -{ return _gfn(l3e_get_paddr(gl3e) >> PAGE_SHIFT); } -#if GUEST_PAGING_LEVELS >= 4 -static inline 
gfn_t guest_l4e_get_gfn(guest_l4e_t gl4e) -{ return _gfn(l4e_get_paddr(gl4e) >> PAGE_SHIFT); } -#endif - -static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e) -{ return l1e_get_flags(gl1e); } -static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e) -{ return l2e_get_flags(gl2e); } -static inline u32 guest_l3e_get_flags(guest_l3e_t gl3e) -{ return l3e_get_flags(gl3e); } -#if GUEST_PAGING_LEVELS >= 4 -static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e) -{ return l4e_get_flags(gl4e); } -#endif - -static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags) -{ l1e_add_flags(gl1e, flags); return gl1e; } -static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags) -{ l2e_add_flags(gl2e, flags); return gl2e; } -static inline guest_l3e_t guest_l3e_add_flags(guest_l3e_t gl3e, u32 flags) -{ l3e_add_flags(gl3e, flags); return gl3e; } -#if GUEST_PAGING_LEVELS >= 4 -static inline guest_l4e_t guest_l4e_add_flags(guest_l4e_t gl4e, u32 flags) -{ l4e_add_flags(gl4e, flags); return gl4e; } -#endif - -static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags) -{ return l1e_from_pfn(gfn_x(gfn), flags); } -static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags) -{ return l2e_from_pfn(gfn_x(gfn), flags); } -static inline guest_l3e_t guest_l3e_from_gfn(gfn_t gfn, u32 flags) -{ return l3e_from_pfn(gfn_x(gfn), flags); } -#if GUEST_PAGING_LEVELS >= 4 -static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags) -{ return l4e_from_pfn(gfn_x(gfn), flags); } -#endif - -#define guest_l1_table_offset(a) l1_table_offset(a) -#define guest_l2_table_offset(a) l2_table_offset(a) -#define guest_l3_table_offset(a) l3_table_offset(a) -#define guest_l4_table_offset(a) l4_table_offset(a) - -/* The shadow types needed for the various levels. 
*/ -#if GUEST_PAGING_LEVELS == 3 -#define PGC_SH2_l1_shadow PGC_SH2_l1_pae_shadow -#define PGC_SH2_fl1_shadow PGC_SH2_fl1_pae_shadow -#define PGC_SH2_l2_shadow PGC_SH2_l2_pae_shadow -#define PGC_SH2_l2h_shadow PGC_SH2_l2h_pae_shadow -#define PGC_SH2_l3_shadow PGC_SH2_l3_pae_shadow -#else -#define PGC_SH2_l1_shadow PGC_SH2_l1_64_shadow -#define PGC_SH2_fl1_shadow PGC_SH2_fl1_64_shadow -#define PGC_SH2_l2_shadow PGC_SH2_l2_64_shadow -#define PGC_SH2_l3_shadow PGC_SH2_l3_64_shadow -#define PGC_SH2_l4_shadow PGC_SH2_l4_64_shadow -#endif - -#endif /* GUEST_PAGING_LEVELS != 2 */ - -#define VALID_GFN(m) (m != INVALID_GFN) - -static inline int -valid_gfn(gfn_t m) -{ - return VALID_GFN(gfn_x(m)); -} - -#if GUEST_PAGING_LEVELS == 2 -#define PGC_SH2_guest_root_type PGC_SH2_l2_32_shadow -#elif GUEST_PAGING_LEVELS == 3 -#define PGC_SH2_guest_root_type PGC_SH2_l3_pae_shadow -#else -#define PGC_SH2_guest_root_type PGC_SH2_l4_64_shadow -#endif - -/* Translation between mfns and gfns */ -static inline mfn_t -vcpu_gfn_to_mfn(struct vcpu *v, gfn_t gfn) -{ - return sh2_vcpu_gfn_to_mfn(v, gfn_x(gfn)); -} - -static inline gfn_t -mfn_to_gfn(struct domain *d, mfn_t mfn) -{ - return _gfn(sh2_mfn_to_gfn(d, mfn)); -} - -static inline paddr_t -gfn_to_paddr(gfn_t gfn) -{ - return ((paddr_t)gfn_x(gfn)) << PAGE_SHIFT; -} - -/* Type used for recording a walk through guest pagetables. It is - * filled in by the pagetable walk function, and also used as a cache - * for later walks. - * Any non-null pointer in this structure represents a mapping of guest - * memory. We must always call walk_init() before using a walk_t, and - * call walk_unmap() when we're done. - * The "Effective l1e" field is used when there isn't an l1e to point to, - * but we have fabricated an l1e for propagation to the shadow (e.g., - * for splintering guest superpages into many shadow l1 entries). 
*/ -typedef struct shadow2_walk_t walk_t; -struct shadow2_walk_t -{ - unsigned long va; /* Address we were looking for */ -#if GUEST_PAGING_LEVELS >= 3 -#if GUEST_PAGING_LEVELS >= 4 - guest_l4e_t *l4e; /* Pointer to guest's level 4 entry */ -#endif - guest_l3e_t *l3e; /* Pointer to guest's level 3 entry */ -#endif - guest_l2e_t *l2e; /* Pointer to guest's level 2 entry */ - guest_l1e_t *l1e; /* Pointer to guest's level 1 entry */ - guest_l1e_t eff_l1e; /* Effective level 1 entry */ -#if GUEST_PAGING_LEVELS >= 3 -#if GUEST_PAGING_LEVELS >= 4 - mfn_t l4mfn; /* MFN that the level 4 entry is in */ -#endif - mfn_t l3mfn; /* MFN that the level 3 entry is in */ -#endif - mfn_t l2mfn; /* MFN that the level 2 entry is in */ - mfn_t l1mfn; /* MFN that the level 1 entry is in */ -}; - -/* macros for dealing with the naming of the internal function names of the - * shadow code's external entry points. - */ -#define INTERNAL_NAME(name) \ - SHADOW2_INTERNAL_NAME(name, SHADOW_PAGING_LEVELS, GUEST_PAGING_LEVELS) - -/* macros for renaming the primary entry points, so that they are more - * easily distinguished from a debugger - */ -#define sh2_page_fault INTERNAL_NAME(sh2_page_fault) -#define sh2_invlpg INTERNAL_NAME(sh2_invlpg) -#define sh2_gva_to_gpa INTERNAL_NAME(sh2_gva_to_gpa) -#define sh2_gva_to_gfn INTERNAL_NAME(sh2_gva_to_gfn) -#define sh2_update_cr3 INTERNAL_NAME(sh2_update_cr3) -#define sh2_remove_write_access INTERNAL_NAME(sh2_remove_write_access) -#define sh2_remove_all_mappings INTERNAL_NAME(sh2_remove_all_mappings) -#define sh2_remove_l1_shadow INTERNAL_NAME(sh2_remove_l1_shadow) -#define sh2_remove_l2_shadow INTERNAL_NAME(sh2_remove_l2_shadow) -#define sh2_remove_l3_shadow INTERNAL_NAME(sh2_remove_l3_shadow) -#define sh2_map_and_validate_gl4e INTERNAL_NAME(sh2_map_and_validate_gl4e) -#define sh2_map_and_validate_gl3e INTERNAL_NAME(sh2_map_and_validate_gl3e) -#define sh2_map_and_validate_gl2e INTERNAL_NAME(sh2_map_and_validate_gl2e) -#define sh2_map_and_validate_gl2he 
INTERNAL_NAME(sh2_map_and_validate_gl2he) -#define sh2_map_and_validate_gl1e INTERNAL_NAME(sh2_map_and_validate_gl1e) -#define sh2_destroy_l4_shadow INTERNAL_NAME(sh2_destroy_l4_shadow) -#define sh2_destroy_l3_shadow INTERNAL_NAME(sh2_destroy_l3_shadow) -#define sh2_destroy_l3_subshadow INTERNAL_NAME(sh2_destroy_l3_subshadow) -#define sh2_unpin_all_l3_subshadows INTERNAL_NAME(sh2_unpin_all_l3_subshadows) -#define sh2_destroy_l2_shadow INTERNAL_NAME(sh2_destroy_l2_shadow) -#define sh2_destroy_l1_shadow INTERNAL_NAME(sh2_destroy_l1_shadow) -#define sh2_unhook_32b_mappings INTERNAL_NAME(sh2_unhook_32b_mappings) -#define sh2_unhook_pae_mappings INTERNAL_NAME(sh2_unhook_pae_mappings) -#define sh2_unhook_64b_mappings INTERNAL_NAME(sh2_unhook_64b_mappings) -#define sh2_paging_mode INTERNAL_NAME(sh2_paging_mode) -#define sh2_detach_old_tables INTERNAL_NAME(sh2_detach_old_tables) -#define sh2_x86_emulate_write INTERNAL_NAME(sh2_x86_emulate_write) -#define sh2_x86_emulate_cmpxchg INTERNAL_NAME(sh2_x86_emulate_cmpxchg) -#define sh2_x86_emulate_cmpxchg8b INTERNAL_NAME(sh2_x86_emulate_cmpxchg8b) -#define sh2_audit_l1_table INTERNAL_NAME(sh2_audit_l1_table) -#define sh2_audit_fl1_table INTERNAL_NAME(sh2_audit_fl1_table) -#define sh2_audit_l2_table INTERNAL_NAME(sh2_audit_l2_table) -#define sh2_audit_l3_table INTERNAL_NAME(sh2_audit_l3_table) -#define sh2_audit_l4_table INTERNAL_NAME(sh2_audit_l4_table) -#define sh2_guess_wrmap INTERNAL_NAME(sh2_guess_wrmap) -#define sh2_clear_shadow_entry INTERNAL_NAME(sh2_clear_shadow_entry) - -/* sh2_make_monitor_table only depends on the number of shadow levels */ -#define sh2_make_monitor_table \ - SHADOW2_INTERNAL_NAME(sh2_make_monitor_table, \ - SHADOW_PAGING_LEVELS, \ - SHADOW_PAGING_LEVELS) -#define sh2_destroy_monitor_table \ - SHADOW2_INTERNAL_NAME(sh2_destroy_monitor_table, \ - SHADOW_PAGING_LEVELS, \ - SHADOW_PAGING_LEVELS) - - -#if GUEST_PAGING_LEVELS == 3 -/* - * Accounting information stored in the shadow of PAE Guest L3 pages. 
- * Because these "L3 pages" are only 32-bytes, it is inconvenient to keep - * various refcounts, etc., on the page_info of their page. We provide extra - * bookkeeping space in the shadow itself, and this is the structure - * definition for that bookkeeping information. - */ -struct pae_l3_bookkeeping { - u32 vcpus; /* bitmap of which vcpus are currently storing - * copies of this 32-byte page */ - u32 refcount; /* refcount for this 32-byte page */ - u8 pinned; /* is this 32-byte page pinned or not? */ -}; - -// Convert a shadow entry pointer into a pae_l3_bookkeeping pointer. -#define sl3p_to_info(_ptr) ((struct pae_l3_bookkeeping *) \ - (((unsigned long)(_ptr) & ~31) + 32)) - -static void sh2_destroy_l3_subshadow(struct vcpu *v, - shadow_l3e_t *sl3e); - -/* Increment a subshadow ref - * Called with a pointer to the subshadow, and the mfn of the - * *first* page of the overall shadow. */ -static inline void sh2_get_ref_l3_subshadow(shadow_l3e_t *sl3e, mfn_t smfn) -{ - struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e); - - /* First ref to the subshadow takes a ref to the full shadow */ - if ( bk->refcount == 0 ) - sh2_get_ref(smfn, 0); - if ( unlikely(++(bk->refcount) == 0) ) - { - SHADOW2_PRINTK("shadow l3 subshadow ref overflow, smfn=%" SH2_PRI_mfn " sh=%p\n", - mfn_x(smfn), sl3e); - domain_crash_synchronous(); - } -} - -/* Decrement a subshadow ref. - * Called with a pointer to the subshadow, and the mfn of the - * *first* page of the overall shadow. Calling this may cause the - * entire shadow to disappear, so the caller must immediately unmap - * the pointer after calling. 
*/ -static inline void sh2_put_ref_l3_subshadow(struct vcpu *v, - shadow_l3e_t *sl3e, - mfn_t smfn) -{ - struct pae_l3_bookkeeping *bk; - - bk = sl3p_to_info(sl3e); - - ASSERT(bk->refcount > 0); - if ( --(bk->refcount) == 0 ) - { - /* Need to destroy this subshadow */ - sh2_destroy_l3_subshadow(v, sl3e); - /* Last ref to the subshadow had a ref to the full shadow */ - sh2_put_ref(v, smfn, 0); - } -} - -/* Pin a subshadow - * Called with a pointer to the subshadow, and the mfn of the - * *first* page of the overall shadow. */ -static inline void sh2_pin_l3_subshadow(shadow_l3e_t *sl3e, mfn_t smfn) -{ - struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e); - -#if 0 - debugtrace_printk("%s smfn=%05lx offset=%ld\n", - __func__, mfn_x(smfn), - ((unsigned long)sl3e & ~PAGE_MASK) / 64); -#endif - - if ( !bk->pinned ) - { - bk->pinned = 1; - sh2_get_ref_l3_subshadow(sl3e, smfn); - } -} - -/* Unpin a sub-shadow. - * Called with a pointer to the subshadow, and the mfn of the - * *first* page of the overall shadow. Calling this may cause the - * entire shadow to disappear, so the caller must immediately unmap - * the pointer after calling. 
*/ -static inline void sh2_unpin_l3_subshadow(struct vcpu *v, - shadow_l3e_t *sl3e, - mfn_t smfn) -{ - struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e); - -#if 0 - debugtrace_printk("%s smfn=%05lx offset=%ld\n", - __func__, mfn_x(smfn), - ((unsigned long)sl3e & ~PAGE_MASK) / 64); -#endif - - if ( bk->pinned ) - { - bk->pinned = 0; - sh2_put_ref_l3_subshadow(v, sl3e, smfn); - } -} - -#endif /* GUEST_PAGING_LEVELS == 3 */ - -#if SHADOW_PAGING_LEVELS == 3 -#define MFN_FITS_IN_HVM_CR3(_MFN) !(mfn_x(_MFN) >> 20) -#endif - -#if SHADOW_PAGING_LEVELS == 2 -#define SH2_PRI_pte "08x" -#else /* SHADOW_PAGING_LEVELS >= 3 */ -#ifndef __x86_64__ -#define SH2_PRI_pte "016llx" -#else -#define SH2_PRI_pte "016lx" -#endif -#endif /* SHADOW_PAGING_LEVELS >= 3 */ - -#if GUEST_PAGING_LEVELS == 2 -#define SH2_PRI_gpte "08x" -#else /* GUEST_PAGING_LEVELS >= 3 */ -#ifndef __x86_64__ -#define SH2_PRI_gpte "016llx" -#else -#define SH2_PRI_gpte "016lx" -#endif -#endif /* GUEST_PAGING_LEVELS >= 3 */ - -static inline u32 -accumulate_guest_flags(walk_t *gw) -{ - u32 accumulated_flags; - - // We accumulate the permission flags with bitwise ANDing. - // This works for the PRESENT bit, RW bit, and USER bit. - // For the NX bit, however, the polarity is wrong, so we accumulate the - // inverse of the NX bit. - // - accumulated_flags = guest_l1e_get_flags(gw->eff_l1e) ^ _PAGE_NX_BIT; - accumulated_flags &= guest_l2e_get_flags(*gw->l2e) ^ _PAGE_NX_BIT; - - // Note that PAE guests do not have USER or RW or NX bits in their L3s. 
- // -#if GUEST_PAGING_LEVELS == 3 - accumulated_flags &= - ~_PAGE_PRESENT | (guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT); -#elif GUEST_PAGING_LEVELS >= 4 - accumulated_flags &= guest_l3e_get_flags(*gw->l3e) ^ _PAGE_NX_BIT; - accumulated_flags &= guest_l4e_get_flags(*gw->l4e) ^ _PAGE_NX_BIT; -#endif - - // Finally, revert the NX bit back to its original polarity - accumulated_flags ^= _PAGE_NX_BIT; - - return accumulated_flags; -} - -#endif /* _XEN_SHADOW2_TYPES_H */ - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/include/asm-x86/shadow2.h b/xen/include/asm-x86/shadow2.h deleted file mode 100644 index d5b56ae16e..0000000000 --- a/xen/include/asm-x86/shadow2.h +++ /dev/null @@ -1,626 +0,0 @@ -/****************************************************************************** - * include/asm-x86/shadow2.h - * - * Parts of this code are Copyright (c) 2006 by XenSource Inc. - * Parts of this code are Copyright (c) 2006 by Michael A Fetterman - * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef _XEN_SHADOW2_H -#define _XEN_SHADOW2_H - -#include -#include -#include -#include - -/* Shadow PT operation mode : shadow-mode variable in arch_domain. */ - -#define SHM2_shift 10 -/* We're in one of the shadow modes */ -#define SHM2_enable (1U << SHM2_shift) -/* Refcounts based on shadow tables instead of guest tables */ -#define SHM2_refcounts (XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT << SHM2_shift) -/* Enable log dirty mode */ -#define SHM2_log_dirty (XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY << SHM2_shift) -/* Xen does p2m translation, not guest */ -#define SHM2_translate (XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE << SHM2_shift) -/* Xen does not steal address space from the domain for its own booking; - * requires VT or similar mechanisms */ -#define SHM2_external (XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL << SHM2_shift) - -#define shadow2_mode_enabled(_d) ((_d)->arch.shadow2.mode) -#define shadow2_mode_refcounts(_d) ((_d)->arch.shadow2.mode & SHM2_refcounts) -#define shadow2_mode_log_dirty(_d) ((_d)->arch.shadow2.mode & SHM2_log_dirty) -#define shadow2_mode_translate(_d) ((_d)->arch.shadow2.mode & SHM2_translate) -#define shadow2_mode_external(_d) ((_d)->arch.shadow2.mode & SHM2_external) - -/* Xen traps & emulates all reads of all page table pages: - *not yet supported - */ -#define shadow2_mode_trap_reads(_d) ({ (void)(_d); 0; }) - -// flags used in the return value of the shadow_set_lXe() functions... -#define SHADOW2_SET_CHANGED 0x1 -#define SHADOW2_SET_FLUSH 0x2 -#define SHADOW2_SET_ERROR 0x4 -#define SHADOW2_SET_L3PAE_RECOPY 0x8 - -// How do we tell that we have a 32-bit PV guest in a 64-bit Xen? -#ifdef __x86_64__ -#define pv_32bit_guest(_v) 0 // not yet supported -#else -#define pv_32bit_guest(_v) !hvm_guest(v) -#endif - -/* The shadow2 lock. 
- * - * This lock is per-domain. It is intended to allow us to make atomic - * updates to the software TLB that the shadow tables provide. - * - * Specifically, it protects: - * - all changes to shadow page table pages - * - the shadow hash table - * - the shadow page allocator - * - all changes to guest page table pages; if/when the notion of - * out-of-sync pages is added to this code, then the shadow lock is - * protecting all guest page table pages which are not listed as - * currently as both guest-writable and out-of-sync... - * XXX -- need to think about this relative to writable page tables. - * - all changes to the page_info->tlbflush_timestamp - * - the page_info->count fields on shadow pages - * - the shadow dirty bit array and count - * - XXX - */ -#ifndef CONFIG_SMP -#error shadow2.h currently requires CONFIG_SMP -#endif - -#define shadow2_lock_init(_d) \ - do { \ - spin_lock_init(&(_d)->arch.shadow2.lock); \ - (_d)->arch.shadow2.locker = -1; \ - (_d)->arch.shadow2.locker_function = "nobody"; \ - } while (0) - -#define shadow2_lock_is_acquired(_d) \ - (current->processor == (_d)->arch.shadow2.locker) - -#define shadow2_lock(_d) \ - do { \ - if ( unlikely((_d)->arch.shadow2.locker == current->processor) ) \ - { \ - printk("Error: shadow2 lock held by %s\n", \ - (_d)->arch.shadow2.locker_function); \ - BUG(); \ - } \ - spin_lock(&(_d)->arch.shadow2.lock); \ - ASSERT((_d)->arch.shadow2.locker == -1); \ - (_d)->arch.shadow2.locker = current->processor; \ - (_d)->arch.shadow2.locker_function = __func__; \ - } while (0) - -#define shadow2_unlock(_d) \ - do { \ - ASSERT((_d)->arch.shadow2.locker == current->processor); \ - (_d)->arch.shadow2.locker = -1; \ - (_d)->arch.shadow2.locker_function = "nobody"; \ - spin_unlock(&(_d)->arch.shadow2.lock); \ - } while (0) - -/* - * Levels of self-test and paranoia - * XXX should go in config files somewhere? 
- */ -#define SHADOW2_AUDIT_HASH 0x01 /* Check current hash bucket */ -#define SHADOW2_AUDIT_HASH_FULL 0x02 /* Check every hash bucket */ -#define SHADOW2_AUDIT_ENTRIES 0x04 /* Check this walk's shadows */ -#define SHADOW2_AUDIT_ENTRIES_FULL 0x08 /* Check every shadow */ -#define SHADOW2_AUDIT_ENTRIES_MFNS 0x10 /* Check gfn-mfn map in shadows */ -#define SHADOW2_AUDIT_P2M 0x20 /* Check the p2m table */ - -#ifdef NDEBUG -#define SHADOW2_AUDIT 0 -#define SHADOW2_AUDIT_ENABLE 0 -#else -#define SHADOW2_AUDIT 0x15 /* Basic audit of all except p2m. */ -#define SHADOW2_AUDIT_ENABLE shadow2_audit_enable -extern int shadow2_audit_enable; -#endif - -/* - * Levels of optimization - * XXX should go in config files somewhere? - */ -#define SH2OPT_WRITABLE_HEURISTIC 0x01 /* Guess at RW PTEs via linear maps */ -#define SH2OPT_EARLY_UNSHADOW 0x02 /* Unshadow l1s on fork or exit */ - -#define SHADOW2_OPTIMIZATIONS 0x03 - - -/* With shadow pagetables, the different kinds of address start - * to get get confusing. - * - * Virtual addresses are what they usually are: the addresses that are used - * to accessing memory while the guest is running. The MMU translates from - * virtual addresses to machine addresses. - * - * (Pseudo-)physical addresses are the abstraction of physical memory the - * guest uses for allocation and so forth. For the purposes of this code, - * we can largely ignore them. - * - * Guest frame numbers (gfns) are the entries that the guest puts in its - * pagetables. For normal paravirtual guests, they are actual frame numbers, - * with the translation done by the guest. - * - * Machine frame numbers (mfns) are the entries that the hypervisor puts - * in the shadow page tables. - * - * Elsewhere in the xen code base, the name "gmfn" is generally used to refer - * to a "machine frame number, from the guest's perspective", or in other - * words, pseudo-physical frame numbers. 
However, in the shadow code, the - * term "gmfn" means "the mfn of a guest page"; this combines naturally with - * other terms such as "smfn" (the mfn of a shadow page), gl2mfn (the mfn of a - * guest L2 page), etc... - */ - -/* With this defined, we do some ugly things to force the compiler to - * give us type safety between mfns and gfns and other integers. - * TYPE_SAFE(int foo) defines a foo_t, and _foo() and foo_x() functions - * that translate beween int and foo_t. - * - * It does have some performance cost because the types now have - * a different storage attribute, so may not want it on all the time. */ -#ifndef NDEBUG -#define TYPE_SAFETY 1 -#endif - -#ifdef TYPE_SAFETY -#define TYPE_SAFE(_type,_name) \ -typedef struct { _type _name; } _name##_t; \ -static inline _name##_t _##_name(_type n) { return (_name##_t) { n }; } \ -static inline _type _name##_x(_name##_t n) { return n._name; } -#else -#define TYPE_SAFE(_type,_name) \ -typedef _type _name##_t; \ -static inline _name##_t _##_name(_type n) { return n; } \ -static inline _type _name##_x(_name##_t n) { return n; } -#endif - -TYPE_SAFE(unsigned long,mfn) -#define SH2_PRI_mfn "05lx" - -static inline int -valid_mfn(mfn_t m) -{ - return VALID_MFN(mfn_x(m)); -} - -static inline mfn_t -pagetable_get_mfn(pagetable_t pt) -{ - return _mfn(pagetable_get_pfn(pt)); -} - -static inline pagetable_t -pagetable_from_mfn(mfn_t mfn) -{ - return pagetable_from_pfn(mfn_x(mfn)); -} - -static inline int -shadow2_vcpu_mode_translate(struct vcpu *v) -{ - // Returns true if this VCPU needs to be using the P2M table to translate - // between GFNs and MFNs. - // - // This is true of translated HVM domains on a vcpu which has paging - // enabled. (HVM vcpu's with paging disabled are using the p2m table as - // its paging table, so no translation occurs in this case.) 
- // - return v->arch.shadow2.hvm_paging_enabled; -} - - -/**************************************************************************/ -/* Mode-specific entry points into the shadow code */ - -struct x86_emulate_ctxt; -struct shadow2_paging_mode { - int (*page_fault )(struct vcpu *v, unsigned long va, - struct cpu_user_regs *regs); - int (*invlpg )(struct vcpu *v, unsigned long va); - unsigned long (*gva_to_gpa )(struct vcpu *v, unsigned long va); - unsigned long (*gva_to_gfn )(struct vcpu *v, unsigned long va); - void (*update_cr3 )(struct vcpu *v); - int (*map_and_validate_gl1e )(struct vcpu *v, mfn_t gmfn, - void *new_guest_entry, u32 size); - int (*map_and_validate_gl2e )(struct vcpu *v, mfn_t gmfn, - void *new_guest_entry, u32 size); - int (*map_and_validate_gl2he)(struct vcpu *v, mfn_t gmfn, - void *new_guest_entry, u32 size); - int (*map_and_validate_gl3e )(struct vcpu *v, mfn_t gmfn, - void *new_guest_entry, u32 size); - int (*map_and_validate_gl4e )(struct vcpu *v, mfn_t gmfn, - void *new_guest_entry, u32 size); - void (*detach_old_tables )(struct vcpu *v); - int (*x86_emulate_write )(struct vcpu *v, unsigned long va, - void *src, u32 bytes, - struct x86_emulate_ctxt *ctxt); - int (*x86_emulate_cmpxchg )(struct vcpu *v, unsigned long va, - unsigned long old, - unsigned long new, - unsigned int bytes, - struct x86_emulate_ctxt *ctxt); - int (*x86_emulate_cmpxchg8b )(struct vcpu *v, unsigned long va, - unsigned long old_lo, - unsigned long old_hi, - unsigned long new_lo, - unsigned long new_hi, - struct x86_emulate_ctxt *ctxt); - mfn_t (*make_monitor_table )(struct vcpu *v); - void (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn); -#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC - int (*guess_wrmap )(struct vcpu *v, - unsigned long vaddr, mfn_t gmfn); -#endif - /* For outsiders to tell what mode we're in */ - unsigned int shadow_levels; - unsigned int guest_levels; -}; - -static inline int shadow2_guest_paging_levels(struct vcpu *v) -{ - 
ASSERT(v->arch.shadow2.mode != NULL); - return v->arch.shadow2.mode->guest_levels; -} - -/**************************************************************************/ -/* Entry points into the shadow code */ - -/* Turning on shadow2 test mode */ -int shadow2_test_enable(struct domain *d); - -/* Handler for shadow control ops: enabling and disabling shadow modes, - * and log-dirty bitmap ops all happen through here. */ -int shadow2_domctl(struct domain *d, - xen_domctl_shadow_op_t *sc, - XEN_GUEST_HANDLE(xen_domctl_t) u_domctl); - -/* Call when destroying a domain */ -void shadow2_teardown(struct domain *d); - -/* Call once all of the references to the domain have gone away */ -void shadow2_final_teardown(struct domain *d); - - -/* Mark a page as dirty in the bitmap */ -void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn); -static inline void mark_dirty(struct domain *d, unsigned long gmfn) -{ - if ( shadow2_mode_log_dirty(d) ) - { - shadow2_lock(d); - sh2_do_mark_dirty(d, _mfn(gmfn)); - shadow2_unlock(d); - } -} - -/* Internal version, for when the shadow lock is already held */ -static inline void sh2_mark_dirty(struct domain *d, mfn_t gmfn) -{ - ASSERT(shadow2_lock_is_acquired(d)); - if ( shadow2_mode_log_dirty(d) ) - sh2_do_mark_dirty(d, gmfn); -} - -static inline int -shadow2_fault(unsigned long va, struct cpu_user_regs *regs) -/* Called from pagefault handler in Xen, and from the HVM trap handlers - * for pagefaults. Returns 1 if this fault was an artefact of the - * shadow code (and the guest should retry) or 0 if it is not (and the - * fault should be handled elsewhere or passed to the guest). */ -{ - struct vcpu *v = current; - perfc_incrc(shadow2_fault); - return v->arch.shadow2.mode->page_fault(v, va, regs); -} - -static inline int -shadow2_invlpg(struct vcpu *v, unsigned long va) -/* Called when the guest requests an invlpg. Returns 1 if the invlpg - * instruction should be issued on the hardware, or 0 if it's safe not - * to do so. 
*/ -{ - return v->arch.shadow2.mode->invlpg(v, va); -} - -static inline unsigned long -shadow2_gva_to_gpa(struct vcpu *v, unsigned long va) -/* Called to translate a guest virtual address to what the *guest* - * pagetables would map it to. */ -{ - return v->arch.shadow2.mode->gva_to_gpa(v, va); -} - -static inline unsigned long -shadow2_gva_to_gfn(struct vcpu *v, unsigned long va) -/* Called to translate a guest virtual address to what the *guest* - * pagetables would map it to. */ -{ - return v->arch.shadow2.mode->gva_to_gfn(v, va); -} - -static inline void -shadow2_update_cr3(struct vcpu *v) -/* Updates all the things that are derived from the guest's CR3. - * Called when the guest changes CR3. */ -{ - shadow2_lock(v->domain); - v->arch.shadow2.mode->update_cr3(v); - shadow2_unlock(v->domain); -} - - -/* Should be called after CR3 is updated. - * Updates vcpu->arch.cr3 and, for HVM guests, vcpu->arch.hvm_vcpu.cpu_cr3. - * - * Also updates other state derived from CR3 (vcpu->arch.guest_vtable, - * shadow_vtable, etc). - * - * Uses values found in vcpu->arch.(guest_table and guest_table_user), and - * for HVM guests, arch.monitor_table and hvm's guest CR3. - * - * Update ref counts to shadow tables appropriately. - * For PAE, relocate L3 entries, if necessary, into low memory. - */ -static inline void update_cr3(struct vcpu *v) -{ - unsigned long cr3_mfn=0; - - if ( shadow2_mode_enabled(v->domain) ) - { - shadow2_update_cr3(v); - return; - } - -#if CONFIG_PAGING_LEVELS == 4 - if ( !(v->arch.flags & TF_kernel_mode) ) - cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user); - else -#endif - cr3_mfn = pagetable_get_pfn(v->arch.guest_table); - - make_cr3(v, cr3_mfn); -} - -extern void sh2_update_paging_modes(struct vcpu *v); - -/* Should be called to initialise paging structures if the paging mode - * has changed, and when bringing up a VCPU for the first time. 
*/ -static inline void shadow2_update_paging_modes(struct vcpu *v) -{ - ASSERT(shadow2_mode_enabled(v->domain)); - shadow2_lock(v->domain); - sh2_update_paging_modes(v); - shadow2_unlock(v->domain); -} - -static inline void -shadow2_detach_old_tables(struct vcpu *v) -{ - if ( v->arch.shadow2.mode ) - v->arch.shadow2.mode->detach_old_tables(v); -} - -static inline mfn_t -shadow2_make_monitor_table(struct vcpu *v) -{ - return v->arch.shadow2.mode->make_monitor_table(v); -} - -static inline void -shadow2_destroy_monitor_table(struct vcpu *v, mfn_t mmfn) -{ - v->arch.shadow2.mode->destroy_monitor_table(v, mmfn); -} - -/* Validate a pagetable change from the guest and update the shadows. */ -extern int shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, - void *new_guest_entry); - -/* Update the shadows in response to a pagetable write from a HVM guest */ -extern void shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, - void *entry, u32 size); - -/* Remove all writeable mappings of a guest frame from the shadows. - * Returns non-zero if we need to flush TLBs. - * level and fault_addr desribe how we found this to be a pagetable; - * level==0 means we have some other reason for revoking write access. */ -extern int shadow2_remove_write_access(struct vcpu *v, mfn_t readonly_mfn, - unsigned int level, - unsigned long fault_addr); - -/* Remove all mappings of the guest mfn from the shadows. - * Returns non-zero if we need to flush TLBs. */ -extern int shadow2_remove_all_mappings(struct vcpu *v, mfn_t target_mfn); - -void -shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn); -/* This is a HVM page that we thing is no longer a pagetable. - * Unshadow it, and recursively unshadow pages that reference it. */ - -/* Remove all shadows of the guest mfn. 
*/ -extern void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all); -static inline void shadow2_remove_all_shadows(struct vcpu *v, mfn_t gmfn) -{ - sh2_remove_shadows(v, gmfn, 1); -} - -/* Add a page to a domain */ -void -shadow2_guest_physmap_add_page(struct domain *d, unsigned long gfn, - unsigned long mfn); - -/* Remove a page from a domain */ -void -shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn, - unsigned long mfn); - -/* - * Definitions for the shadow2_flags field in page_info. - * These flags are stored on *guest* pages... - * Bits 1-13 are encodings for the shadow types. - */ -#define PGC_SH2_type_to_index(_type) ((_type) >> PGC_SH2_type_shift) -#define SH2F_page_type_mask \ - (((1u << (PGC_SH2_type_to_index(PGC_SH2_max_shadow) + 1u)) - 1u) - \ - ((1u << PGC_SH2_type_to_index(PGC_SH2_min_shadow)) - 1u)) - -#define SH2F_L1_32 (1u << PGC_SH2_type_to_index(PGC_SH2_l1_32_shadow)) -#define SH2F_FL1_32 (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_32_shadow)) -#define SH2F_L2_32 (1u << PGC_SH2_type_to_index(PGC_SH2_l2_32_shadow)) -#define SH2F_L1_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l1_pae_shadow)) -#define SH2F_FL1_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_pae_shadow)) -#define SH2F_L2_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l2_pae_shadow)) -#define SH2F_L2H_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l2h_pae_shadow)) -#define SH2F_L3_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l3_pae_shadow)) -#define SH2F_L1_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l1_64_shadow)) -#define SH2F_FL1_64 (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_64_shadow)) -#define SH2F_L2_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l2_64_shadow)) -#define SH2F_L3_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l3_64_shadow)) -#define SH2F_L4_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l4_64_shadow)) - -/* Used for hysteresis when automatically unhooking mappings on fork/exit */ -#define SH2F_unhooked_mappings (1u<<31) - -/* - * Allocation of shadow pages - */ - -/* Return the minumum 
acceptable number of shadow pages a domain needs */ -unsigned int shadow2_min_acceptable_pages(struct domain *d); - -/* Set the pool of shadow pages to the required number of MB. - * Input will be rounded up to at least min_acceptable_shadow_pages(). - * Returns 0 for success, 1 for failure. */ -unsigned int shadow2_set_allocation(struct domain *d, - unsigned int megabytes, - int *preempted); - -/* Return the size of the shadow2 pool, rounded up to the nearest MB */ -static inline unsigned int shadow2_get_allocation(struct domain *d) -{ - unsigned int pg = d->arch.shadow2.total_pages; - return ((pg >> (20 - PAGE_SHIFT)) - + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0)); -} - -/* - * Linked list for chaining entries in the shadow hash table. - */ -struct shadow2_hash_entry { - struct shadow2_hash_entry *next; - mfn_t smfn; /* MFN of the shadow */ -#ifdef _x86_64_ /* Shorten 'n' so we don't waste a whole word on storing 't' */ - unsigned long n:56; /* MFN of guest PT or GFN of guest superpage */ -#else - unsigned long n; /* MFN of guest PT or GFN of guest superpage */ -#endif - unsigned char t; /* shadow type bits, or 0 for empty */ -}; - -#define SHADOW2_HASH_BUCKETS 251 -/* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */ - - -#if SHADOW2_OPTIMIZATIONS & SH2OPT_CACHE_WALKS -/* Optimization: cache the results of guest walks. This helps with MMIO - * and emulated writes, which tend to issue very similar walk requests - * repeatedly. We keep the results of the last few walks, and blow - * away the cache on guest cr3 write, mode change, or page fault. 
*/ - -#define SH2_WALK_CACHE_ENTRIES 4 - -/* Rather than cache a guest walk, which would include mapped pointers - * to pages, we cache what a TLB would remember about the walk: the - * permissions and the l1 gfn */ -struct shadow2_walk_cache { - unsigned long va; /* The virtual address (or 0 == unused) */ - unsigned long gfn; /* The gfn from the effective l1e */ - u32 permissions; /* The aggregated permission bits */ -}; -#endif - - -/**************************************************************************/ -/* Guest physmap (p2m) support */ - -/* Walk another domain's P2M table, mapping pages as we go */ -extern mfn_t -sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn); - - -/* General conversion function from gfn to mfn */ -static inline mfn_t -sh2_gfn_to_mfn(struct domain *d, unsigned long gfn) -{ - if ( !shadow2_mode_translate(d) ) - return _mfn(gfn); - else if ( likely(current->domain == d) ) - return _mfn(get_mfn_from_gpfn(gfn)); - else - return sh2_gfn_to_mfn_foreign(d, gfn); -} - -// vcpu-specific version of gfn_to_mfn(). This is where we hide the dirty -// little secret that, for hvm guests with paging disabled, nearly all of the -// shadow code actually think that the guest is running on *untranslated* page -// tables (which is actually domain->phys_table). -// -static inline mfn_t -sh2_vcpu_gfn_to_mfn(struct vcpu *v, unsigned long gfn) -{ - if ( !shadow2_vcpu_mode_translate(v) ) - return _mfn(gfn); - if ( likely(current->domain == v->domain) ) - return _mfn(get_mfn_from_gpfn(gfn)); - return sh2_gfn_to_mfn_foreign(v->domain, gfn); -} - -static inline unsigned long -sh2_mfn_to_gfn(struct domain *d, mfn_t mfn) -{ - if ( shadow2_mode_translate(d) ) - return get_gpfn_from_mfn(mfn_x(mfn)); - else - return mfn_x(mfn); -} - - - -#endif /* _XEN_SHADOW2_H */ - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * indent-tabs-mode: nil - * End: - */ -